### Imports

In [None]:
import pandas as pd
import numpy as np

import re
import string

import matplotlib.pyplot as plt
import seaborn as sns

import glob
import os

from datetime import datetime

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import preprocessing

from textblob import TextBlob

In [None]:
# Let text columns display up to 800 characters before being truncated.
pd.options.display.max_colwidth = 800

### Functions

In [None]:
def clean_up(s):
    """Normalize a raw tweet into lowercase, space-separated alphabetic words.

    Whitespace-separated tokens that are links (starting with ``http://`` or
    ``https://``, in any letter case) are dropped. The remaining text is
    lowercased and reduced to runs of non-digit word characters, so digits
    and punctuation are removed.

    Args:
        s: Raw tweet text.

    Returns:
        The cleaned words joined by single spaces ('' when nothing is left).
    """
    # Lowercase first so the link test is case-insensitive.
    lowered = (w.lower() for w in s.split())
    kept = [w for w in lowered if not w.startswith(('http://', 'https://'))]
    # [^\d\W]+ : runs of word characters that are not digits.
    return ' '.join(re.findall(r'[^\d\W]+', ' '.join(kept)))

def tokenize(s):
    """Split text into word tokens using NLTK's ``word_tokenize``.

    Args:
        s: Text to split.

    Returns:
        Whatever ``word_tokenize`` produces for ``s``.
    """
    tokens = word_tokenize(s)
    return tokens

def stem_and_lemmatize(l):
    """Stem, then lemmatize, every token and return them as one string.

    The stemmer and lemmatizer both operate on a single word, so each token
    is processed on its own rather than passing the whole joined sentence
    through them as if it were one word.

    Args:
        l: Iterable of word tokens.

    Returns:
        The processed tokens joined by single spaces. A string is returned
        (not a list) because ``remove_stopwords`` splits its input itself.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(token)) for token in l)

def remove_stopwords(l):
    """Drop English stop words from a space-separated string.

    Args:
        l: Text whose words are separated by whitespace.

    Returns:
        List of the words of ``l`` that are not NLTK English stop words,
        in their original order.
    """
    # Load the stop-word corpus once and keep it as a frozenset on the
    # function: later calls reuse it and membership tests are O(1).
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return [word for word in l.split() if word not in stop_words]

def get_pm(row):
    """Tag a tweet with the names it mentions.

    The match is a case-insensitive substring search on ``row["text"]``:
    "boris" or "johnson" yields the tag "boris", "theresa" yields "may".

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a "text" entry.

    Returns:
        The matching tags joined by commas ("boris", "may" or "boris,may"),
        or "none" when neither name is mentioned.
    """
    text = row["text"].lower()
    pms = []
    if "boris" in text or "johnson" in text:
        pms.append("boris")
    if "theresa" in text:
        pms.append("may")
    # "none" is a fallback for no match at all, not for "no theresa".
    return ",".join(pms) if pms else "none"

def sentiment_nlkt(text):
    """Label text 'neg' or 'pos' from NLTK VADER polarity scores.

    Args:
        text: The text to score.

    Returns:
        'neg' when the 'neg' score is strictly greater than the 'pos' score,
        otherwise 'pos' (ties included).
    """
    # Build the analyzer once and reuse it on later calls instead of
    # constructing a new one for every tweet.
    sid = getattr(sentiment_nlkt, '_analyzer', None)
    if sid is None:
        sid = sentiment_nlkt._analyzer = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

def sentiment_textblob(text):
    """Label text 'pos' or 'neg' from its TextBlob polarity.

    Args:
        text: The text to score.

    Returns:
        'pos' when the polarity is greater than or equal to zero,
        otherwise 'neg'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity >= 0:
        return 'pos'
    return 'neg'

### Data

In [None]:
# Read every CSV found directly inside the tweets folder and stack the
# resulting frames into a single one with a fresh 0..n-1 index.
path = r'/Users/ironhack/Documents/GitHub/IronHack/W9FinalProject/final-project/your-project/tweets/' # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Print a summary of the combined frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Standardize the raw export's column names on the loaded frame `df`.
# rename() silently skips labels that are not present, and no columns are
# dropped here because a later cell still selects 'username'.
df = df.rename(columns={'timestamp': 'date', 'tweet_id': 'id', 'tweet_text': 'text'})

In [None]:
# Percentage of missing values in each column of df.
df.isna().sum() * 100 / len(df)

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Show the frame's column labels.
df.columns

In [None]:
# Keep only the columns used below. Take an explicit copy so the column
# assignments in later cells modify an independent frame and do not trigger
# pandas' chained-assignment (SettingWithCopy) warnings.
df = df[['date', 'id', 'username', 'text']].copy()

### Fixing the date column type

In [None]:
# Parse the 'date' column to datetimes, then keep only the calendar date
# (the time-of-day part is dropped).
df['date'] = pd.to_datetime(df['date']).dt.date

### Creating column for Theresa May/Boris Johnson

In [None]:
# Apply get_pm to every row (axis=1) and store its tag in a new "pm" column.
df["pm"] = df.apply(get_pm,axis=1)

In [None]:
# Encode each distinct "pm" tag as an integer label; `le` keeps the mapping.
pm_values = df["pm"].values
le = preprocessing.LabelEncoder().fit(pm_values)
df["pm_label"] = le.transform(pm_values)

In [None]:
# Count how many rows carry each "pm" tag.
df["pm"].value_counts()

In [None]:
# Show the rows whose "pm" tag is exactly 'may'.
df[df['pm'] == 'may']

### Cleaning the tweets

In [None]:
# Run every tweet through the text pipeline one stage at a time:
# clean -> tokenize -> stem/lemmatize -> drop stop words.
text_processed = df['text']
for stage in (clean_up, tokenize, stem_and_lemmatize, remove_stopwords):
    text_processed = text_processed.apply(stage)
df['text_processed'] = text_processed

### Checking top words

In [None]:
# Count word occurrences over the raw tweets (English stop words excluded)
# and chart the 30 most frequent words.
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(df['text'])

# Column-wise totals: one overall count per vocabulary entry.
sum_words = words.sum(axis=0)

# (word, total) pairs ordered from most to least frequent.
words_freq = sorted(
    ((word, sum_words[0, i]) for word, i in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])

frequency.head(30).plot.bar(x='word', y='freq', figsize=(15, 7), color='blue')
plt.title("Most Frequently Occuring Words - Top 30")

In [None]:
def tweet_analyzer(text):
    """Turn one raw tweet into its list of processed tokens.

    CountVectorizer's ``analyzer`` callable must return the tokens of a
    document, so this reuses the pipeline that builds 'text_processed':
    clean -> tokenize -> stem/lemmatize -> drop stop words.
    """
    return remove_stopwords(stem_and_lemmatize(tokenize(clean_up(text))))


# Vocabulary size of the tweets when tokenized by the custom pipeline.
countVectorizer = CountVectorizer(analyzer=tweet_analyzer)
countVector = countVectorizer.fit_transform(df['text'])
print('{} Number of tweets has {} words'.format(countVector.shape[0], countVector.shape[1]))

### Creating bag of words

In [None]:
from collections import Counter

# Flatten the per-tweet token lists into one list, skipping tokens that are
# a single character long.
total_words = [w for words in df['text_processed'] for w in words if len(w) > 1]
# Counter tallies every word in one pass (calling list.count per word is
# quadratic); dict() keeps `bow` a plain dict in first-seen order.
bow = dict(Counter(total_words))
# (word, count) pairs from most to least frequent.
sorted_bow = sorted(bow.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Keep the 50 most frequent (word, count) pairs and lay them out as a
# one-row table: one column per word, the single row labelled 'values'.
sb = dict(sorted_bow[:50])
words = pd.DataFrame(sb, index=['values'])
words

In [None]:
# K-means needs numeric features, so turn the tweets into a word-count
# matrix first (English stop words excluded).
kmeans_vectorizer = CountVectorizer(stop_words='english')
tweet_counts = kmeans_vectorizer.fit_transform(df['text'])

# n_init: how many times k-means is run with different initial centroids
# (the best run is kept). n_jobs is not passed: recent scikit-learn
# releases no longer accept it.
kmeans = KMeans(n_clusters=3, n_init=20)
kmeans.fit(tweet_counts)

# Vocabulary terms ordered by their column index in the count matrix.
terms = sorted(kmeans_vectorizer.vocabulary_, key=kmeans_vectorizer.vocabulary_.get)

# For each of the 3 clusters, list the 25 terms with the largest centroid weight.
common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(terms[word] for word in centroid))
    
    

### Checking sentiment 

In [None]:
#with nlkt sentiment analysys
sentiments_nlkt = df['processed_text'].apply(lambda tweet: sentiment_nlkt(tweet))
pd.DataFrame(sentiments_nlkt.value_counts())

In [None]:
#with textblob sentiment analysys
sentiments_textblob = df['processed_text'].apply(lambda tweet: sentiment_textblob(tweet))
pd.DataFrame(sentiments_textblob.value_counts())