### Imports

In [1]:
import pandas as pd
import numpy as np

import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import glob
import os

from datetime import datetime

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag, pos_tag_sents
from nltk.probability import FreqDist
from nltk.tag import StanfordNERTagger
import nltk

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing

from yellowbrick.cluster import KElbowVisualizer

from textblob import TextBlob
from collections import Counter

In [2]:
pd.set_option('max_colwidth', 800)

### Functions

In [3]:
def clean_up(tweet):
    tweet.lower()  #convert the tweet to lower case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)  #convert urls to string "URL"
    tweet = re.sub('@[^\s]+','AT_USER', tweet) #convert all @username to "AT_USER"
    tweet = re.sub('[\s]+', ' ', tweet)      #correct double white spaces to a single white space
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)   #convert "#topic" to just "topic"
    tweet = re.sub(r'\W*\b\w{1,3}\b', '', tweet)
    tweet = re.sub('[^A-Za-z0-9]+', ' ', tweet)  #remove all ponctuation
    return tweet

def tokenize(s):
    return word_tokenize(s)

def stem_and_lemmatize(l):
    l = ' '.join(l)
    stem = PorterStemmer().stem(l)
    lemm = WordNetLemmatizer().lemmatize(stem)
    return lemm

def remove_stopwords(l):
    stop_words = set(stopwords.words('english')) 
    return [i for i in l.split() if i not in stop_words]

stopwords_dict = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}
#function to detect language based on # of stop words for particular language
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in stopwords_dict.items()), key = lambda x: x[1])[0]
    return True if lang == 'english'else False
    
def get_pm(row):
    pms = []
    text = row["text"].lower()
    if "boris" in text or "johnson" in text:
        pms.append("boris")
    elif "theresa" in text:
        pms.append("may")
    else:
        pms.append("none") 
    return ",".join(pms)

### Data

In [5]:
path = r'/Users/ironhack/Documents/GitHub/IronHack/W9FinalProject/final-project/your-project/tweets/2019' # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))
li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 15 columns):
username          160000 non-null object
to                99155 non-null object
text              159583 non-null object
retweets          160000 non-null int64
favorites         160000 non-null int64
replies           160000 non-null int64
id                160000 non-null int64
permalink         160000 non-null object
author_id         160000 non-null int64
date              160000 non-null object
formatted_date    160000 non-null object
hashtags          38147 non-null object
mentions          21278 non-null object
geo               0 non-null float64
urls              49326 non-null object
dtypes: float64(1), int64(5), object(9)
memory usage: 18.3+ MB


In [7]:
df.isna().sum()*100/len(df)

username            0.000000
to                 38.028125
text                0.260625
retweets            0.000000
favorites           0.000000
replies             0.000000
id                  0.000000
permalink           0.000000
author_id           0.000000
date                0.000000
formatted_date      0.000000
hashtags           76.158125
mentions           86.701250
geo               100.000000
urls               69.171250
dtype: float64

In [8]:
df.columns

Index(['username', 'to', 'text', 'retweets', 'favorites', 'replies', 'id',
       'permalink', 'author_id', 'date', 'formatted_date', 'hashtags',
       'mentions', 'geo', 'urls'],
      dtype='object')

In [9]:
df = df[['date','id', 'username', 'text']]

In [10]:
df.head()

Unnamed: 0,date,id,username,text
0,2019-07-30 23:59:58+00:00,1156353776049111040,janekin24,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!
1,2019-07-30 23:59:58+00:00,1156353774262390785,brexit_clock,"WOW - Another Brexit extension - time now until 31st October 2019 : 92 days, 21 hours, 59 minutes and 59 seconds #BREXIT #BREXITCLOCK #CLOCK #EU #EUREF #LEAVE"
2,2019-07-30 23:59:50+00:00,1156353739437019136,JuanWild51,Brexit: Can Anyone Take the Wheel From Johnson?..#Bojo..#Brexit..#Britain..
3,2019-07-30 23:59:45+00:00,1156353717987368960,DGAll41,"Brexit minister Stephen Barclay tells Michel Barnier that UK will leave EU on October 31 with or without a deal - Evening Standard. Amazing Gove has just been bollocked for similar terminology and less than 24 hrs later, dick Barclay blunders in."
4,2019-07-30 23:59:44+00:00,1156353717232459776,BioMickWatson,This isn't a superhero movie where the bad guy gets to be a good guy because they share common cause. Brexit and Scottish indy *are* the common cause we need to fight against


In [11]:
df.dropna(inplace=True)

In [12]:
df.dtypes

date        object
id           int64
username    object
text        object
dtype: object

In [13]:
duplicateusername = df[df.duplicated(['username'])]
duplicateusername

Unnamed: 0,date,id,username,text
15,2019-07-30 23:59:06+00:00,1156353554929598465,noajida1,"1 El Brexit sense acord que Boris Johnson porta en la seva agenda comença a tenir efectes a les borses. Aquest dimarts, mentre prometia prosperitat als grangers del País de Gal·les, els mercats li portaven la contrària."
23,2019-07-30 23:58:30+00:00,1156353405541072896,JoseSaylor,"BrexBox Episode 3: May takes swipe at the Brexit Party &amp; the great, big ... https://youtu.be/d4VF_jAsHKQ via @YouTube"
37,2019-07-30 23:57:37+00:00,1156353184807305221,Fabiolucv,Brexit minister Barclay tells Barnier UK will leave EU on Oct 31 with or without deal http://dlvr.it/R9NC7S
40,2019-07-30 23:57:20+00:00,1156353112006938630,DGAll41,"Britain searches for post-Brexit trading opportunities in South East Asia - Reuters. Raab is a deluded dick that hasn’t got a clue about the logistics or procedures, he didn’t even realise the scale that Calais and Dover play in imports and exports."
47,2019-07-30 23:56:59+00:00,1156353025306431488,noajida1,El Brexit dur anunciat per Boris Johnson fa tremolar els mercats i enfonsa l'Ibex
...,...,...,...,...
159987,2020-02-12 18:55:42+00:00,1227667613452558339,StrongbowsPub,Well it won’t. The part of the story that was not speculation is what the Germans emphasized. No cherry picking. That’s agreed in the EU. The rest is opening Gambits In the budget negotiations. Enjoy your Brexit.
159990,2020-02-12 18:55:28+00:00,1227667556112257026,abcpoppins,"Benedikt Franke, who leads Munich Security Conference, tweeted his disappointment. “UK remains a key pillar of European security and we strongly believe that we need to engage even more now that Brexit is done,” he said, adding #uklessness."
159996,2020-02-12 18:55:03+00:00,1227667447395864577,Will83064416,"I didn’t vote mate, I support my club no matter what, and also let’s end this on a statistic. 54% voted for parties who wanted another referendum, I notice how you turn a blind eye to that."
159997,2020-02-12 18:54:57+00:00,1227667425610563584,movarsi,"Twelve Days After Brexit, U.K. Falls Foul of EU Rules"


In [14]:
df.drop('id', inplace=True, axis=1)

### Fixing time type column

In [15]:
df['date'] = pd.to_datetime(df['date'])
df['date'] = [d.date() for d in df['date']]
df['month'] = pd.DatetimeIndex(df['date']).month

### Removing non english tweets

In [16]:
df['language'] = df['text'].apply(get_language)
df = df[(df['language'] == True)]

In [17]:
df.drop('language', inplace=True, axis=1)

In [18]:
df

Unnamed: 0,date,username,text,month
0,2019-07-30,janekin24,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!,7
1,2019-07-30,brexit_clock,"WOW - Another Brexit extension - time now until 31st October 2019 : 92 days, 21 hours, 59 minutes and 59 seconds #BREXIT #BREXITCLOCK #CLOCK #EU #EUREF #LEAVE",7
2,2019-07-30,JuanWild51,Brexit: Can Anyone Take the Wheel From Johnson?..#Bojo..#Brexit..#Britain..,7
3,2019-07-30,DGAll41,"Brexit minister Stephen Barclay tells Michel Barnier that UK will leave EU on October 31 with or without a deal - Evening Standard. Amazing Gove has just been bollocked for similar terminology and less than 24 hrs later, dick Barclay blunders in.",7
4,2019-07-30,BioMickWatson,This isn't a superhero movie where the bad guy gets to be a good guy because they share common cause. Brexit and Scottish indy *are* the common cause we need to fight against,7
...,...,...,...,...
159993,2020-02-12,JulieBishenden,We Brexiteers must stay on the battlefield to stiffen Boris' resolve in fighting the EU #Brexit #EU #MondayMotivaton https://www.telegraph.co.uk/politics/2020/02/12/brexiteers-must-stay-battlefield-stiffen-boris-resolve-fighting/?WT.mc_id=tmg_share_tw via @Telegraph,2
159994,2020-02-12,news4321,Man from newspaper that has spent the last three years telling the world Brexit is about Empire nostalgia is annoyed that British commentators have got the Irish election *wrong* #ohtheirony,2
159996,2020-02-12,Will83064416,"I didn’t vote mate, I support my club no matter what, and also let’s end this on a statistic. 54% voted for parties who wanted another referendum, I notice how you turn a blind eye to that.",2
159998,2020-02-12,WilliamHayesWo1,"True that Remainers consider ourselves a bright bunch. But we’ve done nothing to justify the accolade over the past 4 years and the “thick” Brexiteers have run rings round us. Truth is Brexiteers want Brexit at any cost and Remainers never twigged this, never able to respond",2


### Creating column for Theresa May/Boris Jonhson - selecting tweets that only refer the PMs

In [19]:
sentence = u"Twenty miles east of Reno, Nev., " \
    "where packs of wild mustangs roam free through " \
    "the parched landscape, Tesla Gigafactory 1 " \
    "sprawls near Interstate 80."

jar = '/Users/ironhack/Documents/english.all.3class.distsim.crf.ser.gz'
model = '/Users/ironhack/Documents/stanford-ner.jar'

# Prepare NER tagger with english model
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tokenize: Split sentence into words
words = nltk.word_tokenize(sentence)

# Run NER tagger on words
print(ner_tagger.tag(words))

No Java runtime present, requesting install.



OSError: Java command failed : ['/usr/bin/java', '-mx1000m', '-cp', '/Users/ironhack/Documents/english.all.3class.distsim.crf.ser.gz', 'edu.stanford.nlp.ie.crf.CRFClassifier', '-loadClassifier', '/Users/ironhack/Documents/stanford-ner.jar', '-textFile', '/var/folders/_y/9zk4zxk116l2xwsc8gsvjn3m0000gn/T/tmpcx_i7wgo', '-outputFormat', 'slashTags', '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions', '"tokenizeNLs=false"', '-encoding', 'utf8']

In [20]:
df["pm"] = df.apply(get_pm,axis=1)

In [21]:
#le = preprocessing.LabelEncoder()
#df["pm_label"] = le.fit_transform(df.pm.values)

In [22]:
df["pm"].value_counts()

none     121178
boris     12880
may        7652
Name: pm, dtype: int64

In [23]:
may_df = df[df['pm'] == 'may']
boris_df = df[df['pm'] == 'boris']
df_mb = may_df.append(boris_df)

In [36]:
mergedStuff = pd.merge(df, df_mb, on=['username'], how='inner')
mergedStuff.head()

Unnamed: 0,date_x,username,text_x,month_x,pm_x,date_y,text_y,month_y,pm_y
0,2019-07-30,janekin24,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!,7,boris,2019-04-07,"#BrexitBetrayal Oh we know what is 'happening with Brexit.' You Theresa can say words that mean the opposite, try and obfuscate but we see you. You have done all you can to stop Brexit whilst saying you are trying to deliver it. No point in saying any more to us. Just go.",4,may
1,2019-07-30,janekin24,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!,7,boris,2019-04-30,"Theresa May preparing to cave in to Labour demands on Brexit, Eurosceptics fear https://www.telegraph.co.uk/politics/2019/04/30/theresa-may-preparing-cave-labour-demands-brexit-eurosceptics/?WT.mc_id=tmg_share_tw via @Telegraph",4,may
2,2019-07-30,janekin24,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!,7,boris,2019-07-30,Not about Brexit for Boris. I see his strategy. It’s all about killing off TBP. Smokescreen of WTO hard talk.Then he will give us Brino. Well Twitter friends. Are we going to let him get away with this or will we take TBP all the way to government? We are not going into that box!,7,boris
3,2019-04-07,janekin24,"#BrexitBetrayal Oh we know what is 'happening with Brexit.' You Theresa can say words that mean the opposite, try and obfuscate but we see you. You have done all you can to stop Brexit whilst saying you are trying to deliver it. No point in saying any more to us. Just go.",4,may,2019-04-07,"#BrexitBetrayal Oh we know what is 'happening with Brexit.' You Theresa can say words that mean the opposite, try and obfuscate but we see you. You have done all you can to stop Brexit whilst saying you are trying to deliver it. No point in saying any more to us. Just go.",4,may
4,2019-04-07,janekin24,"#BrexitBetrayal Oh we know what is 'happening with Brexit.' You Theresa can say words that mean the opposite, try and obfuscate but we see you. You have done all you can to stop Brexit whilst saying you are trying to deliver it. No point in saying any more to us. Just go.",4,may,2019-04-30,"Theresa May preparing to cave in to Labour demands on Brexit, Eurosceptics fear https://www.telegraph.co.uk/politics/2019/04/30/theresa-may-preparing-cave-labour-demands-brexit-eurosceptics/?WT.mc_id=tmg_share_tw via @Telegraph",4,may


### Cleaning the tweets

In [None]:
df_mb['text_wturls'] = df_mb['text'].apply(clean_up)

In [None]:
df_mb['text_processed'] = df_mb['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize)\
                       .apply(remove_stopwords)

In [None]:
df_mb['clean_text'] = [' '.join(map(str, l)) for l in df_mb['text_processed']]

### Making tags for May

In [None]:
df_mb['POSTags'] = pos_tag_sents(df_mb['clean_text'].apply(word_tokenize).tolist())

In [None]:
df_mb = df_mb[['date', 'month','text','text_wturls','text_processed','clean_text','pm', 'POSTags']]

In [None]:
df_mb

### Checking top words

In [None]:
cv = CountVectorizer(stop_words = 'english')
words = cv.fit_transform(df_mb['clean_text'])

sum_words = words.sum(axis=0)

words_freq = [(word, sum_words[0, i]) for word, i in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)

frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
frequency.head(50).plot(x='word', y='freq', kind='bar', figsize=(15, 7), color = 'blue')
plt.title("top 50 frequent words")

In [None]:
frequency.head(50)

## K-means

### Checking optimal number of clusters

In [None]:
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2,17))
visualizer.fit(desc_matrix)
visualizer.poof()

### Preparing & Implementing K-Means

In [None]:
# vectorizing with CountVectorizer
tf_idf_vect = CountVectorizer(analyzer='word',ngram_range=(1,1),stop_words='english', min_df = 0.0001)
tf_idf_vect.fit(df_mb["clean_text"])
desc_matrix = tf_idf_vect.transform(df_mb["clean_text"])

In [None]:
# implementing kmeans
km = KMeans(n_clusters = 3, n_init = 10, n_jobs = -1)
km.fit(desc_matrix)
clusters = km.labels_.tolist()

In [None]:
df_mb['labels'] = clusters

In [None]:
# vectorizing with TfidfVectorizer
tweets = [tweet for tweet in df_mb['clean_text']]
tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,3))  
tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)  
feature_names = tfidf_vectorizer.get_feature_names()  

In [None]:
km = KMeans(n_clusters = 3, n_init = 10, n_jobs = -1)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [None]:
df_mb['labels_tfid'] = clusters

In [None]:
df_mb[df_mb['pm'] == 'may']['labels'].value_counts()

In [None]:
df_mb[df_mb['pm'] == 'boris']['labels_tfid'].value_counts()

In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(3):
    print("Cluster {}: Words:".format(i))
    for ind in order_centroids[i, :10]: 
        print('    %s' % feature_names[ind])

### Checking results

#### 3 iterations

In [None]:
df_mb[df_mb['pm'] == 'may']['labels'].value_counts()

In [None]:
df_mb[df_mb['pm'] == 'boris']['labels'].value_counts()

#### 10 iterations

In [None]:
df_mb[df_mb['pm'] == 'may']['labels'].value_counts()

In [None]:
df_mb[df_mb['pm'] == 'boris']['labels'].value_counts()

## Evaluating model

In [None]:
preds = cluster.fit_predict(features)
centers = cluster.cluster_centers_

score = silhouette_score (features, preds, metric='euclidean')
print (f"For n_clusters = {n_clusters}, silhouette score is {score})")

## try to plot to see what it looks like

## Sentiment Analysis

### NLKT

In [None]:
#with nlkt sentiment analysys
sid = SentimentIntensityAnalyzer()

df_mb['sentiment_compound_polarity'] = df_mb['clean_text'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_mb['sentiment_neutral'] = df_mb['clean_text'].apply(lambda x: sid.polarity_scores(x)['neu'])
df_mb['sentiment_negative'] = df_mb['clean_text'].apply(lambda x: sid.polarity_scores(x)['neg'])
df_mb['sentiment_pos'] = df_mb['clean_text'].apply(lambda x:sid.polarity_scores(x)['pos'])

df_mb.loc[df_mb.sentiment_compound_polarity > 0,'sentiment'] = 'positive'
df_mb.loc[df_mb.sentiment_compound_polarity == 0,'sentiment'] = 'neutral'
df_mb.loc[df_mb.sentiment_compound_polarity < 0,'sentiment'] = 'negative'
df_mb.head()

In [None]:
#df_mb.drop(['sentiment_polarity', 'sentiment_type','sentiment_neutral', 'sentiment_negative', 'sentiment_pos', 'sentiment_compound_polarity'], axis=1, inplace=True)

In [None]:
df_mb.groupby('pm')['sentiment'].value_counts()

### Textblob

In [None]:
#polarity = lambda x: TextBlob(x).sentiment.polarity
#subjectivity = lambda x: TextBlob(x).sentiment.subjectivity
#df_mb['polarity_blob'] = df_mb['clean_text'].apply(polarity)
#df_mb['subjectivity_blob'] = df_mb['clean_text'].apply(subjectivity)

#df_mb.loc[df_mb.polarity_blob >= 1,'sentiment_blob'] = 'positive'
#df_mb.loc[df_mb.polarity_blob == 0,'sentiment_blob'] = 'neutral'
#df_mb.loc[df_mb.polarity_blob <= 1,'sentiment_blob'] = 'negative'

#df_mb.head()

In [None]:
#with textblob sentiment analysys
sentiments_textblob = df_mb['clean_text'].apply(lambda tweet: sentiment_textblob(tweet))
pd.DataFrame(sentiments_textblob.value_counts())

In [None]:
sentiments_textblob = df_mb['clean_text'].apply(lambda tweet: sid.polarity_scores(tweet))
pd.DataFrame(sentiments_textblob.value_counts())

In [None]:
df_mb['sentiment_n'] = df_mb['sentiment'].apply(
      lambda x: 2 if x == 'positive' else (0 if x == 'negative' else 1))

In [None]:
plt.figure(figsize=(20,5)) # Figure size
sns.lineplot(x='month', y='sentiment_n', data=df_mb, hue='pm', marker='o')
plt.title('Sentiment Analysis on Tweets 2019')
plt.xticks(df_mb.month.unique(), rotation=90) 
plt.show()

In [None]:
df_mb