In [1]:
import pandas as pd
import random

from mongo_aggregation_verbs import *

from lib import create_mongo_client_to_database_collection

collection_reference = create_mongo_client_to_database_collection('twitter', 'tweets')

- https://alexisperrier.com/nlp/2015/09/16/segmentation_twitter_timelines_lda_vs_lsa.html
- https://alexisperrier.com/nlp/2015/09/04/topic-modeling-of-twitter-followers.html

In [2]:
match_empty_url_arrays = { MATCH : { "entities.urls" : [] } }

list(collection_reference.aggregate(
    [
        match_empty_url_arrays,
        { COUNT : "text" }
    ]
))

[{'text': 211518}]

In [3]:
job_hashtags = ['job', 'jobs', 'hiring', 'careerarc']
location_hashtags = ['california', 'losangeles', 'la', 'santamonica', 'glendale', 'paloalto']
match_not_in_bad = { MATCH : { "text" : { "$in" : job_hashtags + location_hashtags } } }
project_to_text_keep_id = { PROJECT : { "text" : "$entities.hashtags.text" } }
project_to_id = { PROJECT : { "_id" : 1 } }

bad_ids = list(collection_reference.aggregate(
    [
        match_non_empty_hashtag_arrays,
        project_to_text_keep_id,
        unwind_text,
        project_to_lower,
        match_not_in_bad,
        project_to_id
    ]
))
bad_ids[:10], len(bad_ids)

([{'_id': ObjectId('5b568e378c4e2000a5220167')},
  {'_id': ObjectId('5b569cc0987491012707420a')},
  {'_id': ObjectId('5b569cc09874910127074228')},
  {'_id': ObjectId('5b569cc09874910127074228')},
  {'_id': ObjectId('5b569cc09874910127074228')},
  {'_id': ObjectId('5b56a9261a6d22025b8bfd5b')},
  {'_id': ObjectId('5b56a9a01a6d22025b8bff7f')},
  {'_id': ObjectId('5b56a9ad1a6d22025b8bffbb')},
  {'_id': ObjectId('5b56a9c81a6d22025b8c0047')},
  {'_id': ObjectId('5b56aa001a6d22025b8c014d')}],
 9904)

In [4]:
bad_ids = [bad_id['_id'] for bad_id in bad_ids]
bad_ids[:10]

[ObjectId('5b568e378c4e2000a5220167'),
 ObjectId('5b569cc0987491012707420a'),
 ObjectId('5b569cc09874910127074228'),
 ObjectId('5b569cc09874910127074228'),
 ObjectId('5b569cc09874910127074228'),
 ObjectId('5b56a9261a6d22025b8bfd5b'),
 ObjectId('5b56a9a01a6d22025b8bff7f'),
 ObjectId('5b56a9ad1a6d22025b8bffbb'),
 ObjectId('5b56a9c81a6d22025b8c0047'),
 ObjectId('5b56aa001a6d22025b8c014d')]

In [5]:
not_in_bad_ids = { "$nin" : bad_ids }

In [6]:
not_in_bad_ids_and_no_url = { 
    "_id"           : not_in_bad_ids, 
    "entities.urls" : []
}

just_the_text = {
    "text" : 1,
    "_id"  : 0
}

In [7]:
collection_reference.find_one(
    not_in_bad_ids_and_no_url,
    just_the_text
)

{'text': 'Can’t wait to go home and play video games already smh'}

In [8]:
cur  = collection_reference.find(
    not_in_bad_ids_and_no_url,
    just_the_text
)

#tweets = list(cur)
tweets = [next(cur) for _ in range(20000)] #너무 많아서 줄여줌
tweet_text = pd.DataFrame(tweets)

In [9]:
len(tweet_text)

20000

In [10]:
tweet_text.head()

Unnamed: 0,text
0,Can’t wait to go home and play video games alr...
1,Can’t wait to go home and play video games alr...
2,ya https://t.co/ZOjRvdvOFB
3,Let’s go #Dodgers
4,"“Chunky peanut butter is rough on the teeth, l..."


In [11]:
tweet_text.text = tweet_text.text.str.replace('http\S+|www.\S+', '', case=False)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(tweet_text.text)
word_occurence = tfidf.transform(tweet_text.text) #.todense()

In [None]:
word_occurence.shape

In [None]:
words = tfidf.get_feature_names()
word_sample = random.sample(words, 20)
word_occurence_m = pd.DataFrame(word_occurence, columns=words)
word_occurence_m[word_sample].head()

In [None]:
from sklearn.decomposition import TruncatedSVD #TruncatedSVD 

In [None]:
#lda = LatentDirichletAllocation(n_topics=10, learning_method='batch')
lda = TruncatedSVD(500)
lda.fit(word_occurence)

In [None]:
lda_df = pd.DataFrame(lda.components_, columns=words).T

In [None]:
def filter_topic(lda_df, index, threshold):
    return (lda_df[lda_df[index] > threshold][index]
            .sort_values(ascending=False))

In [None]:
filter_topic(lda_df, 0, 2)

In [None]:
filter_topic(lda_df, 1, 2)

In [None]:
filter_topic(lda_df, 2, 2)

In [None]:
filter_topic(lda_df, 3, 2)

In [None]:
filter_topic(lda_df, 4, 2)

In [None]:
filter_topic(lda_df, 5, 2)

In [None]:
filter_topic(lda_df, 6, 2)

In [None]:
filter_topic(lda_df, 7, 2)