In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# configure
# %matplotlib inline renders figures directly below the corresponding cell.
%matplotlib inline  
# Apply the 'fivethirtyeight' matplotlib style sheet, then seaborn's
# 'whitegrid' style with seaborn colour codes enabled.
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

# preprocessing
# NOTE(review): stopwords, word_tokenize and sent_tokenize are already imported
# in the nltk section above; the two lines below re-import the same names.
from nltk.corpus import stopwords  # stop-word lists
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # Porter and Lancaster stemmers
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lemmatizer backed by WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# stop-words
# English stop words from NLTK, stored as a set for fast membership tests.
# Used by clean_text() and by the TF-IDF vectorizer further down.
# NOTE(review): needs the NLTK 'stopwords' corpus available locally
# (e.g. via nltk.download('stopwords')) — confirm for a fresh environment.
stop_words=set(nltk.corpus.stopwords.words('english'))

In [2]:
# Load the tweet export into a DataFrame.
# NOTE(review): this is a hard-coded absolute Windows path, so the notebook only
# runs on a machine that has this exact file. The literal is a raw string, so the
# doubled backslashes are kept verbatim in the path rather than collapsed to
# single separators — presumably tolerated by Windows; confirm before moving
# the data or running elsewhere.
df=pd.read_csv(r'D:\\CoronaTweets\\corona_tweet41.csv')

In [3]:
df.head()


Unnamed: 0,id,text,user_id,user_screen_name,user_name
0,1305389523602472962,@YoWaybes Eww 😷 gonna need a mask for that thi...,1221168153008279552,LmaoNoxy,Noxy
1,1305389500504272897,Great words by @supriya_sule from @NCPspeaks t...,1267079862084227073,mannykapron,Manish kapruwan
2,1305389495890653184,"Al die Corona, alleen maar voor een lagere sne...",14183892,hoxha,hoxma
3,1305389473195122688,👉Increasing Corona Cases\n👉 Unemployment\n👉Fal...,2186187964,Ranveer_Chelsea,RANVEER
4,1305389472163540992,@realDonaldTrump Donald Trump has already prov...,3108330785,wizzzzdoom,Hussein Barrak Obama


In [4]:
# Discard the tweet/user metadata columns; only the text is used below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df=df.drop(columns=['id','user_id','user_screen_name','user_name'],errors='ignore')

In [5]:
df.head()

Unnamed: 0,text
0,@YoWaybes Eww 😷 gonna need a mask for that thi...
1,Great words by @supriya_sule from @NCPspeaks t...
2,"Al die Corona, alleen maar voor een lagere sne..."
3,👉Increasing Corona Cases\n👉 Unemployment\n👉Fal...
4,@realDonaldTrump Donald Trump has already prov...


In [6]:
def clean_text(headline):
  """Tokenise a text, drop stop words and short tokens, and lemmatise the rest.

  Parameters
  ----------
  headline : str
      Raw text (here: a tweet). Any non-string value, such as the NaN pandas
      uses for a missing cell, is treated as empty text.

  Returns
  -------
  str
      The surviving tokens, lemmatised with WordNetLemmatizer and joined by
      single spaces. Returns "" when the input is not a string.

  Notes
  -----
  Relies on the module-level ``stop_words`` set. A token is kept only if it is
  longer than three characters and is not a stop word; the stop-word test is
  case-insensitive, while the original casing of kept tokens is preserved.
  """
  # Missing values reach .apply() as float NaN; word_tokenize cannot handle them.
  if not isinstance(headline, str):
    return ""
  le=WordNetLemmatizer()
  word_tokens=word_tokenize(headline)
  # stop_words holds lowercase entries, so compare on the lowercased token;
  # otherwise capitalised stop words such as "This" or "Have" slip through.
  tokens=[le.lemmatize(w) for w in word_tokens if len(w)>3 and w.lower() not in stop_words]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [7]:
# Replace each tweet with its cleaned form (see clean_text above).
# NOTE(review): this overwrites the raw text in place, so re-running the cell
# feeds already-cleaned text through clean_text again.
df['text']=df['text'].apply(clean_text)


In [8]:
df.head()

Unnamed: 0,text
0,YoWaybes need mask thing already corona
1,Great word supriya_sule NCPspeaks today Indian...
2,Corona alleen maar voor lagere snelheid Review...
3,👉Increasing Corona Cases Unemployment 👉Falling...
4,realDonaldTrump Donald Trump already proven ha...


In [16]:
# Build the TF-IDF document-term matrix over uni-, bi- and tri-grams, keeping
# at most 1000 features.
# scikit-learn validates `stop_words` as 'english', a list, or None; recent
# releases reject a set, so hand over a (sorted, hence reproducible) list.
vect = TfidfVectorizer(
    stop_words=sorted(stop_words),
    max_features=1000,
    ngram_range=(1, 3),
)  # to play with: min_df, max_df, max_features etc.
vect_text = vect.fit_transform(df['text'])


In [17]:
# Shape of the TF-IDF matrix, followed by its sparse representation
# ((row, column) index pairs with their stored weights).
print(vect_text.shape)
print(vect_text)

(500, 1000)
  (0, 164)	0.12540249430407818
  (0, 53)	0.5220990919426999
  (0, 863)	0.5088718787872414
  (0, 548)	0.4402108296873329
  (0, 605)	0.5088718787872414
  (1, 615)	0.36278174370270677
  (1, 440)	0.09169256435105191
  (1, 614)	0.36278174370270677
  (1, 142)	0.09169256435105191
  (1, 439)	0.09169256435105191
  (1, 611)	0.30576703089252477
  (1, 280)	0.38068229662866276
  (1, 878)	0.2848305886998597
  (1, 837)	0.38068229662866276
  (1, 978)	0.38068229662866276
  (1, 396)	0.31965177844159326
  (2, 818)	0.43846527661663537
  (2, 361)	0.6036267684437622
  (2, 753)	0.6036267684437622
  (2, 440)	0.1453918051869288
  (2, 142)	0.1453918051869288
  (2, 439)	0.1453918051869288
  (2, 164)	0.12490524004715525
  (3, 704)	0.3959312623932995
  (3, 848)	0.3959312623932995
  :	:
  (498, 786)	0.44656488248640824
  (498, 516)	0.48724794690916406
  (498, 838)	0.4320444682870934
  (498, 771)	0.3590694389610643
  (498, 164)	0.10082359655466935
  (499, 752)	0.27371696885986846
  (499, 118)	0.273716968

In [12]:
from sklearn.decomposition import TruncatedSVD

# Latent semantic analysis: reduce the TF-IDF matrix to 10 components with a
# seeded randomized truncated SVD so the result is reproducible.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

# One row per document, one column per component.
lsa_top = lsa_model.fit_transform(vect_text)

In [13]:
# Reduced representation produced by lsa_model.fit_transform, then its shape.
print(lsa_top)
print(lsa_top.shape)  # (n_documents, n_topics)

[[ 0.11406503 -0.03811296  0.11911714 ...  0.02627249 -0.09427033
  -0.06186018]
 [ 0.13310887 -0.07410099 -0.09029538 ... -0.00098959 -0.01359138
  -0.19445678]
 [ 0.22416288 -0.10098913 -0.00213665 ... -0.0244589  -0.03790002
  -0.00308639]
 ...
 [ 0.12570182 -0.03124555  0.16368946 ... -0.01384268 -0.01358427
  -0.12330394]
 [ 0.09674367 -0.02743044  0.13371369 ... -0.01501533 -0.04372872
  -0.10671349]
 [ 0.22524684  0.13150344 -0.03736298 ... -0.05591091 -0.04736634
  -0.13834789]]
(500, 10)


In [14]:
# Show the weight of every topic for the first document (row 0 of lsa_top).
# Values are multiplied by 100 purely for display; they can be negative, so
# they are not percentages.
l=lsa_top[0]
print("Document 0 :")
for i,topic in enumerate(l):
  print("Topic ",i," : ",topic*100)

Document 0 :
Topic  0  :  11.406502992014133
Topic  1  :  -3.811296009449048
Topic  2  :  11.91171360692925
Topic  3  :  -1.1093390254299493
Topic  4  :  -2.1362754280165506
Topic  5  :  5.521906755786332
Topic  6  :  0.6997987565650933
Topic  7  :  2.6272494584871664
Topic  8  :  -9.427032710776317
Topic  9  :  -6.186017590855605


In [15]:
# Vocabulary in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vect, "get_feature_names_out"):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

# For every LSA component, print the ten terms with the largest weights.
for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    # Sort (term, weight) pairs by weight, highest first, and keep the top 10.
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
corona co http virus case total cases odisha district people 

Topic 1: 
cases total odisha district case virus q4my3bdkor khordha deaths 2010 

Topic 2: 
corona like school back miss death shit test realdonaldtrump much 

Topic 3: 
virus pandemic making lost weekly profit corona help business sbhdkzpswz 

Topic 4: 
people virus trump like realdonaldtrump gopchairwoman even know president real 

Topic 5: 
people india today deaths like recovered update active confirmed rahulgandhi 

Topic 6: 
like year last even tell unemployment lost gdp unplanned making 

Topic 7: 
year last highest unemployment gdp unplanned rate virus lockdown spreadin 

Topic 8: 
realdonaldtrump like trump deaths active confirmed india 4850887 79784 990502 

Topic 9: 
time like recovered work people deaths update virus 4850887 79784 

