In [1]:
import numpy as np
import pandas as pd
import string
import re
import nltk

# English stop-word list from the NLTK stopwords corpus.
stopwords = nltk.corpus.stopwords.words('english')
# Single Porter stemmer instance reused by the tokenising code below.
ps = nltk.PorterStemmer()

In [2]:
# Load the comma-separated export into a DataFrame and print a summary of its
# columns, non-null counts and dtypes.
df = pd.read_csv('export_dataframe2.csv', sep=',')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67783 entries, 0 to 67782
Data columns (total 5 columns):
user_id    67783 non-null int64
content    67783 non-null object
spam       67783 non-null bool
mention    67783 non-null object
hashtag    67783 non-null object
dtypes: bool(1), int64(1), object(3)
memory usage: 2.1+ MB


In [3]:
# Keep only the text, the spam flag and the author id, in that column order.
df = df.loc[:, ['content', 'spam', 'user_id']]

In [4]:
# Show the first five rows.
df.head()

Unnamed: 0,content,spam,user_id
0,RT @techsailorgroup ashley/techsailor at #tech...,True,10836
1,RT@techsailorgroup Isaiah Pang / Entrepreneur...,True,10836
2,RT@techsailorgroup Junji / turner at #techsail...,True,10836
3,RT@tehcsailorgroup Lester Kok / Straits Times ...,True,10836
4,RT@techsailorgroup Nor / Paddle Culture Inter...,True,10836


## Create function to remove punctuation, tokenize, stem, and keep only the most frequent words

In [5]:
def clean_text(text):
    """Tokenise one document and keep only stems found in the top-word vocabulary.

    ASCII punctuation (``string.punctuation``) is removed, the text is
    lower-cased and split on runs of non-word characters, each token is
    stemmed with the module-level Porter stemmer ``ps``, and only stems that
    occur in the module-level ``top100_list`` are kept.

    ``top100_list`` is looked up at call time, so the cell that builds it
    must have been executed before this function is called.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Vocabulary stems in document order, duplicates included.
    """
    # Strip punctuation in one pass, then lower-case the whole string (the
    # same whole-string lower-casing used when the vocabulary is built).
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # Raw-string pattern: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # Set lookup instead of scanning the list for every token.
    vocabulary = set(top100_list)
    # Stem each token exactly once, then filter on the stem.
    stems = (ps.stem(word) for word in tokens)
    return [stem for stem in stems if stem in vocabulary]

### Apply to smaller sample

In [6]:
#sample = df[:20]

#tfidf_vect_sample = TfidfVectorizer(analyzer=clean_text)
#X_tfidf_sample = tfidf_vect_sample.fit_transform(sample['content'])
#print(X_tfidf_sample.shape)
#print(tfidf_vect_sample.get_feature_names())

### Get the 100 most frequent words

In [7]:
# Number of most-frequent stems to keep; used by every most_common(top_N) call below.
top_N = 100

In [8]:
# Lower-case every document and join them into one space-separated corpus string.
text = df['content'].str.lower().str.cat(sep=' ')
# Remove ASCII punctuation with a translation table rather than a
# character-by-character Python loop over the whole corpus.
text = text.translate(str.maketrans('', '', string.punctuation))
# Raw-string pattern: '\W' in a plain literal is an invalid escape sequence.
tokens = re.split(r'\W+', text)
# Stem every token and count how often each stem occurs in the corpus.
tokens_stem = [ps.stem(token) for token in tokens]
word_dist = nltk.FreqDist(tokens_stem)

In [9]:
# Peek at the first ten tokens (punctuation stripped, not yet stemmed).
tokens[:10]

['rt',
 'techsailorgroup',
 'ashleytechsailor',
 'at',
 'techsailor',
 'officewarming',
 'great',
 'party',
 'rttechsailorgroup',
 'isaiah']

In [10]:
stopwords = nltk.corpus.stopwords.words('english')
# A set gives constant-time membership tests; the list form was rescanned
# for every token in the corpus.
stopword_set = set(stopwords)
# NOTE(review): tokens_stem holds punctuation-stripped *stems*, while the stop
# list holds unstemmed words (some with apostrophes). Entries such as 'thi',
# 'wa' and 'dont' therefore survive this filter and appear in the top-word
# table. Confirm whether stop words should be removed before stemming.
words_except_stop_dist = nltk.FreqDist(w for w in tokens_stem if w not in stopword_set)

In [11]:
# Tabulate the top_N most frequent stems that passed the stop-word filter,
# then print a heading, a rule, the table and a closing rule (one per line).
rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
print('All frequencies, NOT including STOPWORDS:',
      '=' * 60,
      rslt,
      '=' * 60,
      sep='\n')

All frequencies, NOT including STOPWORDS:
               Word  Frequency
0                rt      34440
1                ff      13887
2            follow       6822
3      followfriday       4351
4             thank       4168
5                 u       3640
6               thi       3349
7               via       3180
8               new       3010
9              love       2850
10              get       2751
11                2       2489
12          twitter       2446
13              lol       2377
14               de       2163
15               im       2003
16                e       1958
17             quot       1904
18                1       1899
19             make       1872
20             free       1808
21               go       1747
22             like       1742
23            great       1664
24               fb       1660
25              one       1646
26              win       1616
27            pleas       1563
28             dont       1562
29             good       15

In [12]:
# Keep just the words (dropping the counts) of the top_N most frequent stems.
# clean_text() reads this list as its vocabulary.
# The original leading `most_common(top_N)[:5]` expression was removed: it was
# not the cell's last expression, so its value was computed and discarded.
top100_list = [word for word, _count in words_except_stop_dist.most_common(top_N)]

top100_list[:5]

['rt', 'ff', 'follow', 'followfriday', 'thank']

In [13]:
# Print the full vocabulary, most frequent stem first.
print(top100_list)

['rt', 'ff', 'follow', 'followfriday', 'thank', 'u', 'thi', 'via', 'new', 'love', 'get', '2', 'twitter', 'lol', 'de', 'im', 'e', 'quot', '1', 'make', 'free', 'go', 'like', 'great', 'fb', 'one', 'win', 'pleas', 'dont', 'good', 'tcot', 'day', 'wa', 'video', '4', 'que', 'ha', 'tweet', 'nowplay', 'time', 'know', 'see', 'us', 'live', 'da', 'shoutout', 'music', 'news', 'got', 'hi', 'peopl', 'want', 'fail', 'musicmonday', 'need', 'best', '3', 'use', 'today', 'check', 'iranelect', 'look', 'say', 'um', 'think', 'work', 'show', 'back', 'eu', 'friday', 'ur', 'right', 'la', 'friend', 'year', 'money', 'take', 'man', 'tebakbandtransl', 'world', 'forex', 'come', 'watch', 'happi', 'market', 'iran', 'give', 'mm', 'blog', 'help', 'everi', 'por', 'tip', 'w', 'onli', 'life', 'busi', 'travel', 'let', 'justinbieb']


In [14]:
# Sanity check: the vocabulary size should equal top_N.
len(top100_list)

100

In [15]:
import matplotlib.pyplot as plt

# Bar chart of the top_N stems: one bar per word, bar height = frequency.
plt.style.use('ggplot')

rslt = (
    pd.DataFrame(words_except_stop_dist.most_common(top_N),
                 columns=['Word', 'Frequency'])
    .set_index('Word')
)

rslt.plot.bar(rot=0)

<matplotlib.axes._subplots.AxesSubplot at 0x2360cfe53c8>

## Apply CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vocabulary stems per document. Passing clean_text as the analyzer
# makes it responsible for all tokenising and filtering.
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(df['content'])
print(X_count.shape)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# .tolist() keeps the printed form a plain list of str.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
print(feature_names)

(67783, 100)
['1', '2', '3', '4', 'back', 'best', 'blog', 'busi', 'check', 'come', 'da', 'day', 'de', 'dont', 'e', 'eu', 'everi', 'fail', 'fb', 'ff', 'follow', 'followfriday', 'forex', 'free', 'friday', 'friend', 'get', 'give', 'go', 'good', 'got', 'great', 'ha', 'happi', 'help', 'hi', 'im', 'iran', 'iranelect', 'justinbieb', 'know', 'la', 'let', 'life', 'like', 'live', 'lol', 'look', 'love', 'make', 'man', 'market', 'mm', 'money', 'music', 'musicmonday', 'need', 'new', 'news', 'nowplay', 'one', 'onli', 'peopl', 'pleas', 'por', 'que', 'quot', 'right', 'rt', 'say', 'see', 'shoutout', 'show', 'take', 'tcot', 'tebakbandtransl', 'thank', 'thi', 'think', 'time', 'tip', 'today', 'travel', 'tweet', 'twitter', 'u', 'um', 'ur', 'us', 'use', 'via', 'video', 'w', 'wa', 'want', 'watch', 'win', 'work', 'world', 'year']


In [17]:
# Resolve the column labels in a way that works across scikit-learn versions:
# get_feature_names() was removed in 1.2 in favour of get_feature_names_out().
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
# Densify the sparse count matrix and label each column with its stem.
X_count_df = pd.DataFrame(X_count.toarray(), columns=feature_names)
X_count_df[-5:]

Unnamed: 0,1,2,3,4,back,best,blog,busi,check,come,...,via,video,w,wa,want,watch,win,work,world,year
67778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67779,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
67780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67782,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Put each document's author id next to its term counts; concat along the
# column axis aligns rows by index label.
vectorized_df = pd.concat([df['user_id'], X_count_df], axis='columns')
vectorized_df.head()

Unnamed: 0,user_id,1,2,3,4,back,best,blog,busi,check,...,via,video,w,wa,want,watch,win,work,world,year
0,10836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10836,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10836,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Group the per-document term counts by author so they can be aggregated per user.
user_to_term = vectorized_df.groupby(['user_id'])

In [20]:
# Display the per-user totals of every term count (the result is not stored here).
user_to_term.sum()

Unnamed: 0_level_0,1,2,3,4,back,best,blog,busi,check,come,...,via,video,w,wa,want,watch,win,work,world,year
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
9375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10455,1,0,1,0,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
10836,0,0,0,0,0,11,0,2,0,1,...,0,0,0,0,0,0,0,0,2,0
10997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11718,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
# Total each term per user, then keep only users with at least one non-zero total.
user_to_term_sum = user_to_term.sum()
user_to_term_nonzeros = user_to_term_sum.loc[
    user_to_term_sum.ne(0).any(axis='columns')
]

In [22]:
# Display the user-by-term totals for users with at least one vocabulary term.
user_to_term_nonzeros

Unnamed: 0_level_0,1,2,3,4,back,best,blog,busi,check,come,...,via,video,w,wa,want,watch,win,work,world,year
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
9375,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10455,1,0,1,0,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
10836,0,0,0,0,0,11,0,2,0,1,...,0,0,0,0,0,0,0,0,2,0
11718,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#export = user_to_term_nonzeros.to_csv(r'user_to_term.csv',index = None, header=True)