In [8]:
import pandas as pd

# Load the topics table, remove the 'Unnamed: 0' column and write the result
# back to the same path without the index.
topics = pd.read_csv('data/part3/topics.csv')

# 'Unnamed: 0' is presumably a saved index column from an earlier export.
# errors='ignore' keeps this cell re-runnable: after the first run the file on
# disk no longer has that column, and a plain drop would raise KeyError.
topics = topics.drop(columns='Unnamed: 0', errors='ignore')
topics.to_csv('data/part3/topics.csv', index=False)

In [10]:
# Read the tweets table; its `filtered_text` column feeds the n-gram counts
# and the topic-model vectorizer in the cells below.
tweets = pd.read_csv(filepath_or_buffer='data/tweets/processed_all_tweets.csv')

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent words in `corpus` with their counts.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents. English stop words are removed by the vectorizer.
    n : int, optional
        Number of entries to return; ``None`` (the default) returns every
        word in the learned vocabulary.

    Returns
    -------
    list of (word, count) tuples
        Sorted by count in descending order.
    """
    vec = CountVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and counts in a single pass, so the
    # corpus is tokenized once and one-shot iterables (generators) also work.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: a 1 x vocabulary-size matrix of total occurrences per word.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

    
# Twenty most frequent single words across the tweets, as a two-column table.
common_words = get_top_n_words(tweets['filtered_text'], n=20)
unigram = pd.DataFrame(data=common_words, columns=['unigram', 'count'])

In [13]:
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent word trigrams in `corpus` with their counts.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents. English stop words are filtered out by the vectorizer
        before trigrams are built, so a trigram may bridge a removed word.
    n : int, optional
        Number of entries to return; ``None`` (the default) returns every
        trigram in the learned vocabulary.

    Returns
    -------
    list of (trigram, count) tuples
        Sorted by count in descending order.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # fit_transform learns the vocabulary and counts in a single pass, so the
    # corpus is tokenized once and one-shot iterables (generators) also work.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: a 1 x vocabulary-size matrix of total occurrences per trigram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Ten most frequent trigrams across the tweets, as a two-column table.
common_words = get_top_n_trigram(tweets['filtered_text'], n=10)
trigram = pd.DataFrame(data=common_words, columns=['trigram', 'count'])

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Term-count matrix used as input for the topic model.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # fold text to lowercase before tokenizing
    token_pattern='[a-zA-Z0-9]{2,}',  # tokens are runs of 2+ ASCII letters/digits
    stop_words='english',             # drop English stop words
    min_df=3,                         # keep terms found in at least 3 documents
    max_features=5000,                # cap the vocabulary at 5000 terms
)
data_matrix = vectorizer.fit_transform(tweets['filtered_text'])
data_matrix

<12963x3283 sparse matrix of type '<class 'numpy.int64'>'
	with 56461 stored elements in Compressed Sparse Row format>

In [90]:
# Fit a 10-topic LDA model on the term-count matrix.
lda_model = LatentDirichletAllocation(
    n_components=10,           # number of topics
    learning_method='online',
    n_jobs=-1,                 # use all available CPU cores
    random_state=20,           # fixed seed
)

# Document-topic weights returned by the fitted model, one row per document.
lda_output = lda_model.fit_transform(data_matrix)

In [91]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides. The lookup is
# done once here instead of rebuilding the vocabulary list for every word.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for i, topic in enumerate(lda_model.components_):
    print(f'Top 10 words for topic #{i}:')
    # argsort is ascending, so the slice lists the ten heaviest words from
    # lowest to highest weight.
    print([feature_names[j] for j in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['dragon', 'public', 'fun', 'station', 'best', 'way', 'right', 'model', 'time', 'yes']


Top 10 words for topic #1:
['pretty', 'rocket', 'earth', 'engine', 'falcon', 'launch', 'coming', 'probably', 'starship', 'true']


Top 10 words for topic #2:
['battery', 'today', 'ago', 'safety', 'problem', 'hard', 'thanks', 'need', 'production', 'exactly']


Top 10 words for topic #3:
['especially', 'video', 'air', 'engineering', 'doesnt', 'electric', 'real', 'high', 'car', 'like']


Top 10 words for topic #4:
['wow', 'range', 'autopilot', 'times', 'days', 'important', 'hopefully', 'going', 'think', 'make']


Top 10 words for topic #5:
['free', 'later', 'definitely', 'little', 'design', 'know', 'love', 'people', 'actually', 'yeah']


Top 10 words for topic #6:
['making', 'cost', 'people', 'money', 'cool', 'future', 'use', 'lot', 'maybe', 'soon']


Top 10 words for topic #7:
['truth', 'say', 'power', 'solar', 'long', 'world', 'mass', 'day', 'better', 'sure']


Top 10 word

In [95]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides. The lookup is
# done once here instead of rebuilding the vocabulary list for every word.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Render one word cloud per fitted topic (iterating the model's own topic
# rows rather than a hard-coded count) and save it as a PNG.
for i, topic in enumerate(lda_model.components_):
    # Indices of the ten heaviest words, lowest weight first; computed once
    # and shared by the word list and the weight list so they stay aligned.
    top_idx = topic.argsort()[-10:]
    words = [feature_names[j] for j in top_idx]
    # tolist() yields plain Python floats, so the printed list looks the same
    # regardless of the installed numpy version.
    frequency = topic[top_idx].tolist()

    print(frequency)
    wordcloud = WordCloud(
        background_color='white', prefer_horizontal=1, contour_color='steelblue',
        width=500, height=500, mode='RGBA',
        max_font_size=100, min_font_size=30,
    ).generate_from_frequencies(dict(zip(words, frequency)))

    # The image is only written to disk; the earlier bare to_image() call
    # produced a value that was discarded, so it has been dropped.
    wordcloud.to_file(f"data/figures/wordcloud_topic_{i+1}.png")



[91.21843482942255, 96.50084492769908, 102.56758385898026, 111.7469110494651, 167.93536748556897, 204.20450922620807, 211.54313580413577, 276.9305188416934, 316.14522729347885, 593.727875928518]
[155.45711651480593, 159.08945155546346, 159.36802622081325, 160.6849488263695, 170.7194747627197, 179.29532550353989, 213.67591672513203, 256.1062201098517, 259.6719615290022, 328.7636485683467]
[70.98844192884052, 74.69269792816566, 78.15351412371743, 88.121185228947, 109.23830915880804, 189.70462645251706, 196.51692707367616, 234.64349112472524, 251.63386688461975, 263.0401816592922]




[82.13200596201084, 82.8349320163697, 91.69475281169198, 95.09296899694286, 106.26415355460922, 106.49509820900498, 128.95400394255313, 301.65490762046187, 345.79887951626506, 423.38790615503154]
[83.85552439242804, 87.63802243718631, 93.3478286940599, 110.76145693369496, 113.81400805620146, 116.7345500945937, 116.89874605219055, 141.15580335400992, 177.94370000911897, 297.80317845006556]
[91.8207473377658, 96.07393857192152, 108.77276517734914, 111.99754644409104, 128.9754470864864, 132.062246893208, 148.05348160011417, 199.35064841706486, 207.54819583210926, 280.86203134235603]




[85.92578738970406, 91.32233867727933, 91.73658139335939, 105.64973866323145, 135.02813705176027, 145.5071229714664, 167.80659343958646, 187.67423774939908, 209.52193382472188, 252.22324799005384]
[79.66428482452346, 83.55261675145708, 87.65560665175781, 102.58693649353359, 103.07379616638931, 105.86230962157848, 107.30686423123781, 137.21811912651395, 226.91422504779757, 249.3228525719952]
[97.97576760342018, 126.55052820728159, 128.34364127230853, 158.03301776531586, 178.6244622405273, 228.76822291792013, 254.87375174459035, 270.2390320494172, 450.05852679702394, 506.9756591997732]
[80.49701135736007, 80.51338977567886, 85.22339023668135, 90.76019717395671, 95.03318898459445, 99.17090665720698, 114.39163071892209, 128.2093062461494, 133.92659497323353, 243.01749250706547]


