In [15]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

### Loading in Data

In [16]:
# Read the numbered IRAhandle_tweets_<i>.csv files (i = 1..13) in one pass instead of
# thirteen copy-pasted calls, so the directory and the file-name pattern live in one place.
TWEETS_CSV_TEMPLATE = './russian-troll-tweets/IRAhandle_tweets_{}.csv'
tweet_shards = [pd.read_csv(TWEETS_CSV_TEMPLATE.format(i)) for i in range(1, 14)]

# Keep the individual df1..df13 names bound, because a later cell lists them explicitly.
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13) = tweet_shards

In [17]:
# Collect the thirteen per-file frames and stack them row-wise into a single frame.
frames = [
    df1, df2, df3, df4, df5, df6, df7,
    df8, df9, df10, df11, df12, df13,
]
master = pd.concat(frames, axis=0)

In [18]:
# Boolean row selector: True where the 'language' column is exactly 'English'.
mask = master['language'].eq('English')
# Keep only the English-language rows.
eng = master[mask]

In [19]:
# Pull out just the 'content' column of the English rows
# (despite the `_df` suffix, a single-column selection like this is a Series).
content_df = eng.loc[:, 'content']

In [20]:
# Collapse repeated values, keeping the first occurrence of each.
content_df = content_df.drop_duplicates(keep='first')

In [22]:
# Discard missing entries so only non-null values remain.
content_df = content_df[content_df.notnull()]

### Vectorizing, Stemming, and Tokenizing

In [23]:
# Punctuation marks to treat as stop words alongside scikit-learn's English list.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]

# `ENGLISH_STOP_WORDS.union(...)` yields a frozenset. Recent scikit-learn releases
# validate `stop_words` and accept only 'english', a list, or None, so hand over a
# list (accepted by old and new versions alike). Sorting just makes the order stable.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(punc))

# Raw array of the cleaned `content_df` values: the corpus for every vectorizer below.
desc = content_df.values

# Baseline TF-IDF matrix: default tokenization, stop words removed, no stemming.
vectorizer = TfidfVectorizer(stop_words = stop_words)
X = vectorizer.fit_transform(desc)

In [25]:
# Vocabulary learned by the baseline vectorizer.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when it exists and fall back on older installations.
# Wrapping in list() keeps `word_features` a plain list on every version.
word_features = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
print(len(word_features))

2378221


In [26]:
# Shared English Snowball stemmer used by `tokenize`.
stemmer = SnowballStemmer(language='english')
# Tokens are maximal runs of ASCII letters and apostrophes; everything else separates tokens.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z\']+')

def tokenize(text):
    """Return the list of stemmed tokens for `text`.

    The input is lower-cased, split with the module-level `tokenizer`, and each
    resulting token is passed through the module-level `stemmer`.
    """
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        stems.append(stemmer.stem(token))
    return stems

In [27]:
# Second pass: same corpus, but tokens come from `tokenize` (regex split + stemming).
# NOTE(review): `stop_words` holds unstemmed words while `tokenize` emits stems, so some
# stop words may not be filtered out — consider stemming the stop list too; confirm.
vectorizer2 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize)
X2 = vectorizer2.fit_transform(desc)

# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`
# when available and fall back on older installations. list() keeps the printed
# slice below in plain-list form on every version.
word_features2 = list(
    vectorizer2.get_feature_names_out()
    if hasattr(vectorizer2, 'get_feature_names_out')
    else vectorizer2.get_feature_names()
)
print(len(word_features2))
print(word_features2[:50])

2050047
["''", "''twas", "'a", "'aaron'", "'abov", "'adopts'", "'advic", "'b", "'bama", "'bird", "'bless", "'bori", "'bout", "'bronx", "'c", "'china", "'d", "'dear", "'discoveref", "'duck", "'e", "'ebtih", "'em", "'energizer'", "'f", "'fences'", "'forev", "'g", "'guy", "'h", "'hand", "'happi", "'hate", "'he", "'he'd", "'healthi", "'heeey", "'henri", "'honesti", "'how", "'i", "'i'm", "'if", "'increas", "'is", "'islamophobia", "'it", "'j", "'justic", "'k"]


In [28]:
# Final feature matrix for clustering: stemmed tokens, vocabulary capped at 1000 terms.
vectorizer3 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize, max_features = 1000)
X3 = vectorizer3.fit_transform(desc)

# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`
# when available and fall back on older installations. `words` stays a plain list so it
# can be indexed by the column positions taken from the cluster centres.
words = list(
    vectorizer3.get_feature_names_out()
    if hasattr(vectorizer3, 'get_feature_names_out')
    else vectorizer3.get_feature_names()
)

### Clustering

In [29]:
# Going to use 8 clusters to replicate the 8 category types the original Clemson researchers classified.
# `n_jobs` is no longer passed: it was deprecated and then removed from KMeans in
# scikit-learn 1.0, and the value used here (1) requested no parallelism anyway.
# A fixed `random_state` makes the clustering repeatable from run to run.
kmeans = KMeans(n_clusters = 8, n_init = 20, random_state = 42)
kmeans.fit(X3)

# For every centroid, take the column indices of its 25 largest weights, largest first.
common_words = kmeans.cluster_centers_.argsort()[:,-1:-26:-1]

# Print one line per cluster: "<cluster number> : <top terms, comma separated>".
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : https, t, rt, polit, peopl, like, just, local, amp, say, new, black, obama, don't, man, polic, i'm, make, time, year, love, need, hillari, day, know
1 : s, https, t, u, trump, break, obama, new, video, just, hillari, n, http, o, k, m, topnew, c, d, p, news, e, h, f, b
2 : http, t, workout, exercis, weight, lose, need, good, k, m, fit, d, j, https, r, b, c, x, e, n, l, diet, p, g, u
3 : sport, win, game, open, warrior, lsu, final, nfl, cleveland, coach, s, new, lead, beat, state, say, team, player, season, footbal, report, star, miss, bowl, play
4 : news, world, t, say, https, kill, polic, s, u, state, new, local, man, attack, china, report, suspect, arrest, year, syria, dead, shoot, islam, south, eu
5 : trump, https, t, donald, presid, polit, support, rt, just, break, video, say, obama, clinton, anti, vote, media, hillari, look, maga, gop, new, attack, elect, liber
6 : want, z, https, t, don't, rt, peopl, just, http, trump, know, man, like, say, make, hillari, u, amp, new, polic, n