## Topics 

In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
import pandas as pd
%reload_ext cypher
import re

In [2]:
# Extra tokens to ignore on top of NLTK's English stop words: URL fragments,
# bare numbers and corpus-specific noise strings.
# NOTE(review): the 'ud...' entries are presumably residue of mangled emoji
# escapes - confirm against the raw tweets.
new_set = {
    '000', 'de', 'rt', 'http', 'https', 'amp', '1', '25', 'pm', '2',
    'rtudf0a', 'udf0a', 'lifeudc68udfff', 'udfa4udc69udffb', 'udfa8',
    'udf08udf37udf3audd16',
    'lmaooooooooooooooooooooooooooooooooooooooooooooude02ude02',
    'udf31small', '',
}

# Final stop-word set handed to the TF-IDF vectorizer in train_NMF.
stop_words_ = set(stopwords.words('english')) | new_set

## Topics by NMF (Non-negative Matrix Factorization)

In [3]:
# Settings shared by the NMF helpers defined below.
n_components = 20     # passed to NMF(n_components=...)
n_features = 10000    # passed to TfidfVectorizer(max_features=...)
n_top_words = 10      # number of terms listed per topic

# topic index -> space-joined top terms; written by print_top_words.
full_topics = dict()

def train_NMF(data_samples):
    """Fit a TF-IDF vectorizer and an NMF topic model on ``data_samples``.

    The top terms of every fitted topic are printed (and recorded in the
    module-level ``full_topics``) through ``print_top_words``.

    Parameters
    ----------
    data_samples : iterable of str
        Documents to vectorize and factorize.

    Returns
    -------
    tuple
        ``(nmf, tfidf_vectorizer)``: the fitted ``NMF`` model and the fitted
        ``TfidfVectorizer``.
    """
    # stop_words is passed as a list: recent scikit-learn releases validate
    # this parameter and reject a set, while a list is accepted everywhere.
    tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1,
                                       max_features=n_features,
                                       stop_words=list(stop_words_))
    tfidf = tfidf_vectorizer.fit_transform(data_samples)

    # NOTE(review): the `alpha` argument was replaced by `alpha_W`/`alpha_H`
    # in newer scikit-learn releases; the replacement is scaled differently,
    # so it is deliberately not swapped in here - confirm the target version.
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tfidf)

    # get_feature_names() was removed in favour of get_feature_names_out();
    # prefer the new accessor and fall back for older installations.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    print_top_words(nmf.components_, feature_names, n_top_words)

    return nmf, tfidf_vectorizer
      
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic and record them.

    Parameters
    ----------
    model : iterable of 1-D arrays
        One weight vector per topic (the caller passes ``nmf.components_``).
    feature_names : sequence of str
        Term for each column index of the weight vectors.
    n_top_words : int
        How many terms to list per topic.

    Side effects
    ------------
    Stores the space-joined terms of topic ``i`` in the module-level
    ``full_topics[i]`` and prints one line per topic followed by a blank line.
    """
    for topic_idx, weights in enumerate(model):
        # Indices ordered from largest to smallest weight, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[pos] for pos in ranked)
        full_topics[topic_idx] = top_terms
        print("Topic #%d: " % topic_idx + top_terms)
        print()
        
def clean_tweets(tweets):
    """Remove URLs, @mentions and #hashtags from every tweet.

    Each tweet is cleaned individually, so tweet boundaries can never be
    altered by the tweet's own content and an empty input yields an empty
    list.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts. Missing entries (``None`` / ``NaN``) are skipped.

    Returns
    -------
    list of str
        The cleaned texts, in input order, one per non-missing tweet.
    """
    # Raw strings keep ``\(`` and ``\)`` as regex escapes without relying on
    # Python passing unknown string escapes through (which emits a warning).
    # NOTE(review): inside the character class, ``$-_`` is a range from '$'
    # to '_', so it also covers digits, upper-case letters and much
    # punctuation; kept unchanged to preserve what gets removed.
    token_body = r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Order matters: URLs are stripped first, then users, then tags.
    patterns = (
        re.compile(r'http[s]?://' + token_body),  # urls
        re.compile(r'@' + token_body),            # users
        re.compile(r'#' + token_body),            # tags
    )

    cleaned = []
    for tweet in tweets:
        if pd.isnull(tweet):
            # Missing texts carry nothing to clean; drop them.
            continue
        for pattern in patterns:
            tweet = pattern.sub('', tweet)
        cleaned.append(tweet)
    return cleaned

In [4]:
# Labels compared against the `eco` property of tweet nodes in the query below;
# one NMF model is trained per label.
ecos = [ 'BLM', 'MT', 'CCH', 'GC']
for eco in ecos:
    # Cypher line magic (must stay on a single line): returns the text of tweet
    # nodes with n.eco == eco and n.lang == 'en' that have an incoming TWEETS
    # relationship from a user node. `{eco}` is filled in from the loop variable.
    tweetsQ = %cypher match  (n:tweet)<-[r :TWEETS]-(n2:user) where n.eco = '{eco}' and n.lang='en' return substring(n.text, 0, 10000000) as text
    # Keep each distinct text once, then strip URLs, mentions and hashtags.
    training_data = clean_tweets(tweetsQ.get_dataframe().text.unique())
    print(eco, len(training_data), 'tweets -------------------------------------------------------------------')
    # train_NMF prints the topics; `nmf` and `tfidf_vectorizer` are rebound on
    # every iteration, so only the last label's model remains after the loop.
    nmf, tfidf_vectorizer = train_NMF(training_data)

52663 rows affected.
BLM 22149 tweets -------------------------------------------------------------------
Topic #0: people call calling color get cops time white wrong need

Topic #1: 2018 nflnetwork tell may 00pm 00am 10 11 09 june

Topic #2: like follow party retweet looks back resisters great comment see

Topic #3: decal barack large custom jus got laptop clinton hillary huge

Topic #4: matter lives shirt tshirt via black blm blue saying support

Topic #5: latest thanks daily guns anomelodious fiorentina5 trail paper dont hands

Topic #6: police officer man brutality video officers call killed year old

Topic #7: know let world get need see even assault go mentions

Topic #8: black man women men community book woman person cop crime

Topic #9: king luther martin jr decal favor welcome proud religion would

Topic #10: america trump guns childish gambino video bless god heard great

Topic #11: white pro gun long woman racist privilege person supremacy supremacist

Topic #12: love teac