### Imports

In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import glob
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
from nltk.probability import FreqDist
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing

from yellowbrick.cluster import KElbowVisualizer
from datetime import datetime
import re

In [187]:
pd.set_option('max_colwidth', 800)

### Functions

In [188]:
def clean_up(tweet):
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet.lower())
    tweet = re.sub('[\s]+', ' ', tweet)  
    tweet = re.sub(r'\W*\b\w{1,3}\b', '', tweet)
    tweet = re.sub('[^A-Za-z0-9]+', ' ', tweet) 
    return tweet

def tokenize(tweet):
    return word_tokenize(tweet)

def stem_and_lemmatize(tweet):
    tweet = ' '.join(tweet)
    stem = PorterStemmer().stem(tweet)
    return WordNetLemmatizer().lemmatize(stem)

def remove_stopwords(tweet):
    stop_words = set(stopwords.words('english')) 
    return [i for i in tweet.split() if i not in stop_words]

#function to detect language based on # of stop words for particular language
stopwords_dict = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in stopwords_dict.items()), key = lambda x: x[1])[0]
    return True if lang == 'english'else False
    
def get_pm(row):
    pms = []
    text = row["TEXT"].lower()
    if "boris" in text or "johnson" in text:
        pms.append("Boris Johnson")
    elif "theresa" in text:
        pms.append("Theresa May")
    else:
        pms.append("none") 
    return ",".join(pms)

### Data

In [189]:
path = r'/Users/ironhack/Documents/GitHub/IronHack/W9FinalProject/final-project/your-project/tweets/2019'
all_files = glob.glob(os.path.join(path, "*.csv"))
list_of_files = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    list_of_files.append(df)

df = pd.concat(list_of_files, axis=0, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383269 entries, 0 to 383268
Data columns (total 16 columns):
author_id         380000 non-null float64
date              383269 non-null object
favorites         380000 non-null float64
formatted_date    380000 non-null object
geo               0 non-null float64
hashtags          93174 non-null object
id                383269 non-null int64
mentions          49660 non-null object
permalink         380000 non-null object
pm                3269 non-null object
replies           380000 non-null float64
retweets          380000 non-null float64
text              382645 non-null object
to                235693 non-null object
urls              117328 non-null object
username          383269 non-null object
dtypes: float64(5), int64(1), object(10)
memory usage: 46.8+ MB


In [191]:
df.isna().sum()*100/len(df)

author_id           0.852926
date                0.000000
favorites           0.852926
formatted_date      0.852926
geo               100.000000
hashtags           75.689659
id                  0.000000
mentions           87.043043
permalink           0.852926
pm                 99.147074
replies             0.852926
retweets            0.852926
text                0.162810
to                 38.504549
urls               69.387558
username            0.000000
dtype: float64

In [192]:
df.columns

Index(['author_id', 'date', 'favorites', 'formatted_date', 'geo', 'hashtags',
       'id', 'mentions', 'permalink', 'pm', 'replies', 'retweets', 'text',
       'to', 'urls', 'username'],
      dtype='object')

In [193]:
df = df[['date','id', 'username', 'text']]
df.columns = map(str.upper, df.columns)

In [194]:
df.shape

(383269, 4)

In [195]:
df.dropna(inplace=True)

In [196]:
#check types
df.dtypes

DATE        object
ID           int64
USERNAME    object
TEXT        object
dtype: object

In [197]:
#check for duplicates in the tweet ID column
df[df.duplicated(subset = 'ID')]

Unnamed: 0,DATE,ID,USERNAME,TEXT
30000,2019-03-12 23:59:59+00:00,1105619479478845441,El_7usseny,طب سريعاً كدا فيه غفلة من رجال الأمن أحب أتكلم شويه عن الـBrexit اللي هو انفصال المملكة المتحدة UK عن الاتحاد الأوروبي في ثريد صغير
30001,2019-03-12 23:59:59+00:00,1105619476047953928,michael_b28,"@heuteshow ist das eigentlich richtig, dass der simulierte Angriff auf den EU Luftraum gestern Abend durch TUIfly mit #737MAX8 eine Aktionskunst zur Verhinderung des #Brexit war?"
30002,2019-03-12 23:59:57+00:00,1105619470641446912,repnews,"Listen to this. What is Britain's ""fundamental interest"" in Ireland? If they answer that first, then they can figure out the border, then they can figure out Brexit."
30003,2019-03-12 23:59:54+00:00,1105619456565395458,Lusa_noticias,Portugal prepara-se para cenário do 'Brexit' sem acordo que é “hoje mais possível” - MNE - https://www.lusa.pt/article/25807673
30004,2019-03-12 23:59:54+00:00,1105619455739158529,themarketsniper,"Parliament’s rejection of Mrs. May’s deal shifts the focus to a vote scheduled for Wednesday on whether to oppose leaving without a deal - WATCH TOMORROW'S BETRAYAL OF REAL BREXIT, AS MAJORITY VOTE AGAINST NO DEAL EXIT. 99% CERTAIN OF IT. @TheResetSniper @DollarVigilante"
...,...,...,...,...
306811,2019-09-13 21:52:15+00:00,1172629085387968513,ittatto23,"This absolute gobshite has a nerve since he’s the very one who’s responsible for all this #Brexit shite in the first place, then he pissed off when it all went wrong &amp; let other politicians in Parliament like #TheresaMay to deal with it all! #DavidCameron"
306976,2019-09-13 21:47:57+00:00,1172628003987345408,DarrylInnes,your fantasy Brexit was lost the moment that Theresa May called a snap election and ended up with a minority government. Bercow has just given the majority of opposition a voice as he is supposed to do and as the election mandated.
307814,2019-09-13 21:28:25+00:00,1172623090922196999,sundersays,"David Cameron and Theresa May, for somewhat different reasons, are in an especially lonely place in a politics polarised by Brexit. Distrusted as ""Remainers"" by Leave advocates, but probably more unpopular still with Remainers"
308094,2019-09-13 21:21:49+00:00,1172621430196002817,femiokes,I see your argument. But would rather he resign if he felt he can't can it through. What wrong with a bit of honesty? #TheresaMay took over promising to deliver but could not have done more to mess up #brexit instead!


In [198]:
#removing duplicates
df.drop_duplicates(subset='ID', keep="first", inplace=True)

In [199]:
#now that there are no duplicates, so we can drop the column
df.drop('ID', inplace=True, axis=1)

In [200]:
#checking totalrows
df.shape

(367693, 3)

### Fixing time type column

In [201]:
#from the date only the month is important to analyse the change throughout the year
df['DATE'] = pd.to_datetime(df['DATE'])
df['MONTH_INT'] = pd.DatetimeIndex(df['DATE']).month
df['MONTH_STR'] = df['DATE'].dt.strftime('%b')
df.drop('DATE', inplace=True, axis=1)

### Removing non english tweets

In [202]:
df['language'] = df['TEXT'].apply(get_language)
df = df[(df['language'] == True)]
df.drop('language', inplace=True, axis=1)

In [203]:
df.shape

(329937, 4)

### Creating column for Theresa May/Boris Jonhson - selecting tweets that only refer the PMs

In [204]:
df["PM"] = df.apply(get_pm,axis=1)

In [205]:
df['PM'].value_counts()

none             282333
Boris Johnson     33555
Theresa May       14049
Name: PM, dtype: int64

### Adjusting dataset so we have a more balanced number of tweets between May&Boris

In [206]:
df['PM'].value_counts()/len(df)

none             0.855718
Boris Johnson    0.101701
Theresa May      0.042581
Name: PM, dtype: float64

In [207]:
#removing 5% of Boris tweets
cdf = df.drop(df[df['PM'] == 'Boris Johnson'].sample(frac=.5).index)

In [208]:
cdf['PM'].value_counts()

none             282333
Boris Johnson     16777
Theresa May       14049
Name: PM, dtype: int64

In [172]:
df_may = df[df['PM'] == 'Theresa May']
df_boris = df[df['PM'] == 'Boris Johnson']
df_none = df[df['PM'] == 'none']
df_mb = df_may.append(df_boris)

In [96]:
df_mb['PM'].value_counts()

Boris Johnson    33555
Theresa May      14049
Name: PM, dtype: int64

### Dataset with the "None" group and the group that mentions May & Boris

### Remove user accounts that are related to news and trolls?

In [209]:
#find twitter accounts with brexit in the name, these are to be removed
brexit_usernames = cdf[cdf['USERNAME'].str.contains('brexit|Brexit')]

In [210]:
cond = cdf['USERNAME'].isin(brexit_usernames['USERNAME'])
#cond.value_counts()
cdf.drop(cdf[cond].index, inplace = True)

In [211]:
cdf.shape

(306910, 5)

In [212]:
'''checking top users with most frequent tweets, these look like bots/trolls/news and do not add any meaningful value 
to the analysis so they will be removed'''
usernames_to_remove = cdf['USERNAME'].value_counts().sort_values(ascending=False).nlargest(5)
frame = usernames_to_remove.to_frame().reset_index()
frame

Unnamed: 0,index,USERNAME
0,AtlantoCeltica,615
1,dddoc_blogger,288
2,Doozy_45,254
3,BBCPropaganda,248
4,JeanneBartram,201


In [213]:
cond1 = cdf['USERNAME'].isin(frame['index'])
#cond.value_counts()
cdf.drop(cdf[cond1].index, inplace = True)

In [214]:
cdf.shape

(305304, 5)

In [215]:
cdf['PM'].value_counts()

none             275161
Boris Johnson     16408
Theresa May       13735
Name: PM, dtype: int64

In [216]:
cdf['PM'].value_counts()/len(df)

none             0.833980
Boris Johnson    0.049731
Theresa May      0.041629
Name: PM, dtype: float64

### Cleaning the tweets

In [217]:
cdf['TWEET_PROCESSED'] = cdf['TEXT'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)
cdf['TWEET_CLEANED'] = [' '.join(map(str, l)) for l in cdf['TWEET_PROCESSED']]

In [None]:
df_mb['TWEET_PROCESSED'] = df_mb['TEXT'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)
df_mb['TWEET_CLEANED'] = [' '.join(map(str, l)) for l in df_mb['TWEET_PROCESSED']]

df_may['TWEET_PROCESSED'] = df_may['TEXT'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)
df_may['TWEET_CLEANED'] = [' '.join(map(str, l)) for l in df_may['TWEET_PROCESSED']]

df_boris['TWEET_PROCESSED'] = df_boris['TEXT'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)
df_boris['TWEET_CLEANED'] = [' '.join(map(str, l)) for l in df_boris['TWEET_PROCESSED']]

df_test['TWEET_PROCESSED'] = df_test['TEXT'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)
df_test['TWEET_CLEANED'] = [' '.join(map(str, l)) for l in df_test['TWEET_PROCESSED']]


In [218]:
cdf

Unnamed: 0,USERNAME,TEXT,MONTH_INT,MONTH_STR,PM,TWEET_PROCESSED,TWEET_CLEANED
3,DGAll41,"Brexit minister Stephen Barclay tells Michel Barnier that UK will leave EU on October 31 with or without a deal - Evening Standard. Amazing Gove has just been bollocked for similar terminology and less than 24 hrs later, dick Barclay blunders in.",7,Jul,none,"[brexit, minister, stephen, barclay, tells, michel, barnier, leave, october, without, deal, evening, standard, amazing, gove, bollocked, similar, terminology, less, later, dick, barclay, blund]",brexit minister stephen barclay tells michel barnier leave october without deal evening standard amazing gove bollocked similar terminology less later dick barclay blund
4,BioMickWatson,This isn't a superhero movie where the bad guy gets to be a good guy because they share common cause. Brexit and Scottish indy *are* the common cause we need to fight against,7,Jul,none,"[superhero, movie, gets, good, share, common, cause, brexit, scottish, indy, common, cause, need, fight]",superhero movie gets good share common cause brexit scottish indy common cause need fight
5,Timlagor,Labour has to promise #2ndRef. No one wants a soft Brexit -many of the people on either end prefer the opposite end to the middle. People WILL throw a tantrum with their vote over this. What Labour do NOT have to do is back Remain in that #2ndRef.,7,Jul,none,"[labour, promise, 2ndref, wants, soft, brexit, many, people, either, prefer, opposite, middle, people, throw, tantrum, vote, labour, back, remain, 2ndref]",labour promise 2ndref wants soft brexit many people either prefer opposite middle people throw tantrum vote labour back remain 2ndref
7,MySynthDreams,What's this talk about #brexit ? All the bars in Budapest are full of Brits,7,Jul,none,"[talk, brexit, bars, budapest, full, brit]",talk brexit bars budapest full brit
8,JoseSaylor,Ann Widdecombe Says She Has 'No Regrets' Comparing Brexit to Slavery | G... https://youtu.be/WkkQHrepBBA via @YouTube,7,Jul,none,"[widdecombe, says, regrets, comparing, brexit, slavery, youtub]",widdecombe says regrets comparing brexit slavery youtub
...,...,...,...,...,...,...,...
383261,AgataGostynska,For the European Parliament’s tactics in the trade negotiations read this @CER_EU from 2017: https://www.cer.eu/publications/archive/policy-brief/2017/parliamentarians-brexit-talks-bulls-china-shop; #brexit,2,Feb,none,"[european, parliament, tactics, trade, negotiations, read, cer, eu, 2017, brexit]",european parliament tactics trade negotiations read cer eu 2017 brexit
383263,news4321,Man from newspaper that has spent the last three years telling the world Brexit is about Empire nostalgia is annoyed that British commentators have got the Irish election *wrong* #ohtheirony,2,Feb,none,"[newspaper, spent, last, three, years, telling, world, brexit, empire, nostalgia, annoyed, british, commentators, irish, election, wrong, ohtheironi]",newspaper spent last three years telling world brexit empire nostalgia annoyed british commentators irish election wrong ohtheironi
383265,Will83064416,"I didn’t vote mate, I support my club no matter what, and also let’s end this on a statistic. 54% voted for parties who wanted another referendum, I notice how you turn a blind eye to that.",2,Feb,none,"[vote, mate, support, club, matter, also, statistic, voted, parties, wanted, another, referendum, notice, turn, blind]",vote mate support club matter also statistic voted parties wanted another referendum notice turn blind
383267,WilliamHayesWo1,"True that Remainers consider ourselves a bright bunch. But we’ve done nothing to justify the accolade over the past 4 years and the “thick” Brexiteers have run rings round us. Truth is Brexiteers want Brexit at any cost and Remainers never twigged this, never able to respond",2,Feb,none,"[true, remainers, consider, bright, bunch, done, nothing, justify, accolade, past, years, thick, brexiteers, rings, round, truth, brexiteers, want, brexit, cost, remainers, never, twigged, never, able, respond]",true remainers consider bright bunch done nothing justify accolade past years thick brexiteers rings round truth brexiteers want brexit cost remainers never twigged never able respond


In [219]:
#df_mb = df_mb[['MONTH_INT', 'MONTH_STR','USERNAME','TEXT','TWEET_PROCESSED','TWEET_CLEANED','PM']]
#df_may = df_may[['MONTH_INT','MONTH_STR','USERNAME','TEXT','TWEET_PROCESSED','TWEET_CLEANED','PM']]
#df_boris = df_boris[['MONTH_INT','MONTH_STR','USERNAME','TEXT','TWEET_PROCESSED','TWEET_CLEANED','PM']]
cdf = cdf[['MONTH_INT','MONTH_STR','USERNAME','TEXT','TWEET_PROCESSED','TWEET_CLEANED','PM']]

In [221]:
cdf.MONTH_INT.value_counts()

10    36117
5     35382
11    29216
4     25443
6     25370
9     24441
3     24303
1     23038
2     22275
8     20304
7     19963
12    19452
Name: MONTH_INT, dtype: int64

In [223]:
cdf.groupby('PM').MONTH_INT.value_counts()

PM             MONTH_INT
Boris Johnson  10            3254
               11            2376
               9             2170
               7             1973
               8             1513
               3             1430
               12            1379
               6             1316
               1              328
               2              246
               4              216
               5              207
Theresa May    4             3812
               1             2691
               12            1869
               2             1795
               3             1666
               5             1031
               6              320
               7              160
               10             120
               9              108
               11              85
               8               78
none           5            34144
               10           32743
               11           26755
               6            23734
               9       

In [229]:
cdf.to_csv('cdf.csv',index=False)