In [54]:
import pandas as pd

import matplotlib.pyplot as plt

In [55]:
df = pd.read_csv('OnceUponATimeInHollywood.csv')

In [56]:
df.head()

Unnamed: 0,comment_id,comment_created_utc,comment_edited,comment_is_submitter,comment_score,comment_body
0,euyw97v,1564109000.0,False,False,3823,Brad Pitt tripping on acid pointing the finger...
1,euz6ywt,1564114000.0,False,False,5204,“More than a brother but not quite a wife.”\n\...
2,euys7gs,1564108000.0,1564115096.0,False,5152,That scene where DiCaprio tells himself he won...
3,euyrqkr,1564107000.0,False,False,4869,That scene at the ranch when Cliff was going i...
4,euyril8,1564107000.0,False,False,4462,Jesus fucking Christ Tarantino loves feet


In [57]:
df.shape

(20427, 6)

In [58]:
#clean fot topic modelling

df.dropna(axis=1 ,how='any', inplace=True)

In [59]:
df.shape

(20427, 6)

In [60]:
df.isna().sum()

comment_id              0
comment_created_utc     0
comment_edited          0
comment_is_submitter    0
comment_score           0
comment_body            0
dtype: int64

In [61]:
columns = df.columns[:5]
df.drop(columns , inplace=True ,axis=1)

In [62]:
df['comment_body'].value_counts()[:5]

[deleted]       310
[removed]        58
Lol r/cringe     13
Lol               7
Yes               6
Name: comment_body, dtype: int64

In [64]:
df = df[df['comment_body'] != '[deleted]']

In [73]:
df = df[df['comment_body'] != '[removed]']

In [75]:
df.head()

Unnamed: 0,comment_body
0,Brad Pitt tripping on acid pointing the finger...
1,“More than a brother but not quite a wife.”\n\...
2,That scene where DiCaprio tells himself he won...
3,That scene at the ranch when Cliff was going i...
4,Jesus fucking Christ Tarantino loves feet


In [76]:
len(df)

20059

In [77]:
df.shape

(20059, 1)

In [79]:
df['comment_body'][1]

'“More than a brother but not quite a wife.”\n\nThat’s the take away line of the film for me.'

# first steps

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

In [82]:
cv = CountVectorizer(max_df=0.95 , min_df=2 , stop_words='english')

dtm = cv.fit_transform(df['comment_body'])
dtm

<20059x10829 sparse matrix of type '<class 'numpy.int64'>'
	with 313617 stored elements in Compressed Sparse Row format>

In [84]:
from sklearn.decomposition import LatentDirichletAllocation

In [85]:
LDA = LatentDirichletAllocation(n_components=7 , random_state=42)
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [92]:
LDA.get_params()

{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'batch',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 7,
 'n_jobs': None,
 'perp_tol': 0.1,
 'random_state': 42,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}

# Three steps away

## Grab the vocab

In [93]:
len(cv.get_feature_names())

10829

In [94]:
cv.get_feature_names()[500]

'anderson'

In [95]:
import random

In [96]:
for i in range(10):
    random_word_id = random.randint(0,10829)
    print(cv.get_feature_names()[random_word_id])

typical
finance
ensuring
magnificent
tatantino
delete
assumption
june
unbelievably
wp


In [98]:
LDA.components_

array([[1.51432316, 2.14278136, 0.14289984, ..., 0.14317044, 0.14285823,
        0.14285716],
       [0.14285723, 0.14285726, 0.14285717, ..., 1.71828658, 0.14527273,
        4.14296475],
       [0.14289799, 0.14293259, 1.28175656, ..., 0.14340975, 0.1428584 ,
        0.14390393],
       ...,
       [0.14316052, 0.14285721, 0.1593196 , ..., 2.56564692, 0.1428586 ,
        0.14285717],
       [0.15479275, 0.14285717, 0.14285715, ..., 0.14304933, 2.13547861,
        1.14170266],
       [0.14302619, 0.14285723, 2.98745251, ..., 0.14351676, 0.14781509,
        0.14285717]])

In [101]:
for index , topic in enumerate(LDA.components_):
    print(f' The top 15 words for topic {index+1}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

 The top 15 words for topic 1
['really', 'good', 'pulp', 'watch', 'fiction', 'basterds', 'films', 'think', 'just', 'like', 'movies', 'time', 'tarantino', 'movie', 'film']


 The top 15 words for topic 2
['real', 'car', 'com', 'dog', 'cigarette', 'https', 'got', 'going', 'girl', 'just', 'acid', 'did', 'scene', 'cliff', 'like']


 The top 15 words for topic 3
['time', 'flashback', 'like', 'character', 'think', 'movie', 'brad', 'sharon', 'pitt', 'scene', 'tate', 'rick', 'lee', 'bruce', 'cliff']


 The top 15 words for topic 4
['like', 'family', 'did', 'real', 'don', 'just', 'murders', 'think', 'sharon', 'didn', 'movie', 'people', 'tate', 'know', 'manson']


 The top 15 words for topic 5
['actor', 'character', 'think', 'like', 'hollywood', 'great', 'really', 'best', 'scene', 'film', 'tarantino', 'dalton', 'leo', 'movie', 'rick']


 The top 15 words for topic 6
['scenes', 'think', 'didn', 'movies', 'feel', 'plot', 'story', 'time', 'felt', 'really', 'film', 'tarantino', 'just', 'like', 'movi

In [123]:
topic_results = LDA.transform(dtm)

In [124]:
topic_results.shape

(20059, 7)

In [125]:
topic_results[0]

array([0.39828963, 0.55042207, 0.01029147, 0.01023021, 0.01027906,
       0.01025818, 0.01022938])

In [126]:
df.head()

Unnamed: 0,comment_body
0,Brad Pitt tripping on acid pointing the finger...
1,“More than a brother but not quite a wife.”\n\...
2,That scene where DiCaprio tells himself he won...
3,That scene at the ranch when Cliff was going i...
4,Jesus fucking Christ Tarantino loves feet


In [127]:
topic_results.argmax(axis=1)

array([1, 4, 4, ..., 1, 1, 1], dtype=int64)

In [131]:
df['Topic'] = topic_results.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Topic'] = topic_results.argmax(axis=1)


In [132]:
df

Unnamed: 0,comment_body,Topic
0,Brad Pitt tripping on acid pointing the finger...,1
1,“More than a brother but not quite a wife.”\n\...,4
2,That scene where DiCaprio tells himself he won...,4
3,That scene at the ranch when Cliff was going i...,6
4,Jesus fucking Christ Tarantino loves feet,6
...,...,...
20419,"I'm an electrical engineer dogg, if I'm retard...",6
20420,Lol r/cringe,1
20422,Lol r/cringe,1
20424,Lol r/cringe,1
