In [1]:
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import re
from dateutil.parser import parse
from IPython.display import display, HTML
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk import ngrams
from collections import Counter
import numpy as np
import string as string_
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import tempfile
import warnings

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Notebook-wide display configuration.
# NOTE(review): ignoring every warning also hides failures reported as warnings
# (e.g. failed grid-search fits); consider narrowing this filter.
warnings.filterwarnings('ignore')
# Show full tweet text in DataFrame output. `None` means "no truncation";
# the old sentinel -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Widen the classic-notebook container so wide frames and plots fit on screen.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Data Extraction

In [2]:
# Load the raw tweet dump. No header row is supplied to pandas here, so the
# column names are given explicitly, and the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=["sentiment", "ids", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
)

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(1600000, 6)

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,sentiment,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [5]:
# Preview the last rows of the raw data.
df.tail()

Unnamed: 0,sentiment,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H


# Initial Data Cleaning

#### Change labels to binary format

In [None]:
# Recode the positive label from 4 to 1 so the target is binary (0 / 1).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df.sentiment`: that form mutates an
# intermediate Series and is not guaranteed to update `df` (it silently does
# nothing when pandas Copy-on-Write is active).
df['sentiment'] = df['sentiment'].replace({4: 1})

In [None]:
# Label encoding after recoding:
#   1 -> positive
#   0 -> negative

# Number of tweets per sentiment label.
df['sentiment'].value_counts()

#### Extract Dates for analysis

In [None]:
# date functions
# Three-letter English month abbreviation -> zero-padded month number (as text),
# e.g. "Apr" -> "04". Built from the ordered abbreviation list.
month_map = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        start=1,
    )
}

def extract_year(string):
    """Return the first run of four consecutive digits in ``string``.

    For timestamps such as ``"Mon Apr 06 22:19:45 PDT 2009"`` this is the year
    (``"2009"``). The result is a string, not an int.

    Raises:
        AttributeError: if ``string`` contains no four-digit run
            (``re.search`` returns ``None``).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.search(r'(\d{4})', string).group()

def extract_time(string):
    """Return the first ``HH:MM:SS`` (24-hour) time found in ``string``.

    Hours are limited to 00-23 and minutes/seconds to 00-59 by the pattern.

    Raises:
        AttributeError: if no such time is present (``re.search`` returns
            ``None``).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.search(r'(?:[01]\d|2[0123]):(?:[012345]\d):(?:[012345]\d)', string).group()

def extract_day(string):
    """Return the first whitespace-delimited token of ``string``.

    For timestamps such as ``"Mon Apr 06 22:19:45 PDT 2009"`` this is the
    weekday abbreviation (``"Mon"``).

    Raises:
        IndexError: if ``string`` is empty or contains only whitespace.
    """
    # Only the leading token is needed, so stop splitting after it.
    leading_token = string.split(None, 1)[0]
    return leading_token

def extract_date(string, month_map = month_map):
    """Build a ``YYYY-MM-DD`` date string from a raw timestamp.

    The year comes from ``extract_year``; the second whitespace-delimited token
    is looked up in ``month_map`` and the third token is used as the day as-is.

    Args:
        string: Raw timestamp, e.g. ``"Mon Apr 06 22:19:45 PDT 2009"``.
        month_map: Mapping from month abbreviation to two-digit month text.

    Returns:
        The date as text, e.g. ``"2009-04-06"``.
    """
    year = extract_year(string)
    leading_tokens = string.split()[:3]
    # Pieces are evaluated left to right: year, mapped month, then day.
    return '-'.join((year, month_map[leading_tokens[1]], leading_tokens[2]))

def extract_datetime(string, month_map = month_map):
    """Build a ``YYYY-MM-DD HH:MM:SS`` string from a raw timestamp.

    Args:
        string: Raw timestamp, e.g. ``"Mon Apr 06 22:19:45 PDT 2009"``.
        month_map: Mapping from month abbreviation to two-digit month text;
            forwarded to ``extract_date``.

    Returns:
        The date from ``extract_date`` and the time from ``extract_time``
        joined by a single space.
    """
    # Forward the caller's mapping; previously the parameter was accepted but
    # ignored, so a custom month_map had no effect.
    date = extract_date(string, month_map)
    time = extract_time(string)
    return date + " " + time

In [None]:
# Derive analysis-friendly date columns from the raw timestamp string, then
# drop the columns that are no longer needed.
# Only the column is renamed: the former `index=str` argument converted every
# row label to a string, which nothing downstream relies on.
df = df.rename(columns={"date": "date_old"})

# The extractors take a single string, so they can be passed to apply directly.
df['datetime'] = df['date_old'].apply(extract_datetime)
df['time'] = df['date_old'].apply(extract_time)
df['date'] = df['date_old'].apply(extract_date)
df['day'] = df['date_old'].apply(extract_day)

df = df.drop(columns=['date_old', 'flag', 'ids'])

In [None]:
# Preview the frame after the date columns were derived.
df.head()

# Data Exploration

#### At first glance, with uncleaned raw text, we have a vocabulary of over 1.19 million unique words, although this includes strings that will later be removed, including usernames, urls, words attached to punctuation and so on.  

In [None]:
# Size of the raw vocabulary: distinct lower-cased, whitespace-delimited tokens
# across all tweets (no cleaning applied yet).
len({token for token in df['text'].str.cat(sep=' ').lower().split()})

#### The average length of each tweet is around 13 words

In [None]:
# Mean number of whitespace-delimited tokens per tweet.
sum(len(tweet.split()) for tweet in df['text']) / df.shape[0]

#### Tweets were posted between April 6, 2009 and June 25, 2009.

In [None]:
# Earliest and latest timestamp in the data (compared as text in the
# "YYYY-MM-DD HH:MM:SS" layout produced by extract_datetime).
(df['datetime'].min(), df['datetime'].max())

#### We can split the data by sentiment class to see whether there are any noticeable differences that could help us in the later stages of feature selection, engineering and cleaning.

In [None]:
# One frame per sentiment class: 1 = positive, 0 = negative.
df_pos = df.loc[df['sentiment'] == 1]
df_neg = df.loc[df['sentiment'] == 0]

#### Looking at the number of tweets in each sentiment class for each day of the week, we can see that positive tweets seem to occur less often on Wednesdays and Thursdays, and more often on Mondays and Sundays. Negative tweets seem more evenly distributed, with slight increases across the weekends. However, this could be an artifact of how the data was collected.

In [None]:
# Weekday order for the x-axis (crosstab would otherwise sort alphabetically).
index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Tweet counts per weekday (rows) and sentiment label (columns).
day_sentiment_counts = pd.crosstab(df['day'], df['sentiment']).reindex(index)
day_sentiment_counts.plot.bar(title='Sentiment counts each day', figsize=(15, 10))

#### Looking at the time each sentiment class occurs does not provide much valuable information.

In [None]:
# Number of positive tweets at each distinct time of day, as a line plot.
positive_counts_by_time = df_pos.groupby('time')['sentiment'].count()
positive_counts_by_time.plot(figsize=(25, 10), title='Tweets with positive sentiment by time')

In [None]:
# Number of negative tweets at each distinct time of day, as a line plot.
negative_counts_by_time = df_neg.groupby('time')['sentiment'].count()
negative_counts_by_time.plot(figsize=(25, 10), title='Tweets with negative sentiment by time')

In [None]:
def get_wordcloud(text, custom_stopwords = None):
    """Render a word cloud for ``text`` with matplotlib.

    Args:
        text: Either a single string or an iterable of strings (for example a
            pandas Series of tweets). Iterables are joined with spaces so that
            every element contributes to the cloud.
        custom_stopwords: Optional iterable of extra words to exclude, in
            addition to wordcloud's built-in ``STOPWORDS``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # Build the corpus explicitly. Calling str() on a pandas Series returns its
    # truncated display repr (a handful of rows plus index/dtype text), so the
    # cloud would be drawn from almost none of the data.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = ' '.join(map(str, text))
        except TypeError:
            # Not iterable: fall back to its string form.
            corpus = str(text)

    # Work on a copy so the library's shared STOPWORDS set is never mutated.
    excluded_words = set(STOPWORDS)
    if custom_stopwords:
        excluded_words.update(custom_stopwords)

    wordcloud = WordCloud(
        width = 3000,
        height = 2000,
        background_color = 'black',
        stopwords = excluded_words).generate(corpus)

    plt.figure(
        figsize = (30, 20),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

####  Wordclouds for both classes (pre-cleaning) give some valuable insight. Words such as 'congrats', 'thanks', 'best' and 'love' appear in the positive sentiment tweets...

In [None]:
# Word cloud of the raw (uncleaned) positive tweets.
get_wordcloud(df_pos['text'])

#### ...and words from the negative sentiment tweets' wordcloud include 'RIP', 'Sad' and 'ugh'

In [None]:
# Word cloud of the raw (uncleaned) negative tweets.
get_wordcloud(df_neg['text'])

# Text processing

In [None]:

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string_.punctuation)

# One-element cache so the default stop-word list is resolved only once.
_DEFAULT_STOPWORDS_CACHE = []


def _default_stopwords():
    """Return the default stop-word set, resolving it on first use."""
    if not _DEFAULT_STOPWORDS_CACHE:
        # `stopwords` as imported from nltk.corpus is a corpus reader, not a
        # collection of words, so membership tests against it do not work;
        # the English list has to be requested explicitly. If the name has
        # been rebound to a plain collection, use that collection as-is.
        word_loader = getattr(stopwords, 'words', None)
        if callable(word_loader):
            resolved = frozenset(word_loader('english'))
        else:
            resolved = frozenset(stopwords)
        _DEFAULT_STOPWORDS_CACHE.append(resolved)
    return _DEFAULT_STOPWORDS_CACHE[0]


def clean_text(raw_string, stop_words = None):
    """Normalise a tweet into a space-separated string of lower-case tokens.

    Steps, in order: lower-case, remove URLs (``http...``), remove
    ``@username`` mentions, strip the ``#`` from hashtags, delete ASCII
    punctuation, collapse whitespace, drop stop words.

    Args:
        raw_string: The raw tweet text.
        stop_words: Optional collection of words to drop. When omitted, the
            English stop-word list is used.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    if stop_words is None:
        stop_words = _default_stopwords()

    cleaned_string = raw_string.lower()
    cleaned_string = re.sub(r"http\S+", "", cleaned_string)     # remove URLs
    cleaned_string = re.sub(r'@\S+', '', cleaned_string)        # remove usernames
    cleaned_string = re.sub(r'#(\S+)', r'\1', cleaned_string)   # keep hashtag text, drop '#'
    cleaned_string = cleaned_string.translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is removed before stop-word filtering, so stop
    # words containing apostrophes (e.g. "don't") can never match - confirm
    # whether that is intended.
    # split() already discards leading/trailing/repeated whitespace, so no
    # separate whitespace-collapsing pass is needed.
    return ' '.join(word for word in cleaned_string.split() if word not in stop_words)

##### Apply the text cleaning function to the dataframe and remove rows with fewer than two words remaining

In [None]:
# Clean every tweet, then keep only rows whose cleaned text has more than one
# token.
df['clean_text'] = df['text'].apply(clean_text)
# Assign the re-indexed frame back: reset_index returns a new frame, so calling
# it without assignment left `df` unchanged (and displayed the whole frame).
df = df[df['clean_text'].str.split().str.len() > 1].reset_index(drop=True)

## Modelling with TFIDF Vectorizer 

#### Split the data into its training and testing subsets

In [None]:
# Hold out 20% of the cleaned tweets for testing; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=0,
)

# Report the size of each subset.
for subset_name, subset in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{subset_name} shape: {subset.shape}")

#### Create a simple pipeline to vectorize the text and classify using logistic regression

In [None]:
# TF-IDF settings shared by the pipelines: unigrams + bigrams, IDF weighting,
# raw (non-sublinear) term frequency.
tfidf_args = {
    'ngram_range': (1, 2),
    'use_idf': True,
    'sublinear_tf': False,
}

# Vectorize, then classify. `memory` caches fitted transformers on disk so the
# vectorizer is not refit for every classifier setting in the grid.
pipe_logit = Pipeline([
    ('vectorizer', TfidfVectorizer(**tfidf_args)),
    # The grid below tries both 'l1' and 'l2'. The default solver in current
    # scikit-learn releases (lbfgs) does not support 'l1', so those fits would
    # fail; liblinear supports both penalties.
    ('classifier', LogisticRegression(solver='liblinear'))
], memory=tempfile.gettempdir())


# Regularisation type and inverse regularisation strength to search over.
param_grid_logit = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C':[.1, 1, 10]
}

# 5-fold cross-validated search on accuracy, using all available cores.
gs_logit = GridSearchCV(pipe_logit, param_grid_logit, n_jobs=-1, cv=5, verbose=3, scoring='accuracy')
# fit() returns the fitted search object, so `best_logit_pipe` is the
# GridSearchCV instance rather than a bare Pipeline.
best_logit_pipe = gs_logit.fit(X_train, y_train)

In [None]:
# Persist the fitted logistic-regression search, reload it from disk, and show
# the best hyper-parameters it found.
logit_model_path = 'logit_pipe.sav'
dump(best_logit_pipe, logit_model_path)
best_logit_pipe = load(logit_model_path)
best_logit_pipe.best_params_

In [None]:
# Per-class precision / recall / F1 of the logistic-regression model on the
# held-out test set.
logit_predictions = best_logit_pipe.predict(X_test)
print(classification_report(y_test, logit_predictions))

#### Create a simple pipeline to vectorize the text and classify using a linear support vector classifier

In [None]:
# Same TF-IDF front end as the logistic-regression pipeline, with a linear SVM
# as the classifier.
pipe_svc = Pipeline([
    ('vectorizer', TfidfVectorizer(**tfidf_args)),
    # The grid below tries both 'l1' and 'l2'. LinearSVC only supports the 'l1'
    # penalty when the primal problem is solved (dual=False); with the dual
    # formulation those fits would fail. dual=False works for both penalties.
    ('classifier', LinearSVC(dual=False))
], memory=tempfile.gettempdir())


# Regularisation type and strength to search over.
param_grid_svc = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C':[.1, 1, 10]
}

# 5-fold cross-validated search on accuracy, using all available cores.
gs_svc = GridSearchCV(pipe_svc, param_grid_svc, n_jobs=-1, cv=5, verbose=3, scoring='accuracy')
# fit() returns the fitted search object, so `best_svc_pipe` is the
# GridSearchCV instance rather than a bare Pipeline.
best_svc_pipe = gs_svc.fit(X_train, y_train)

In [None]:
# Persist the fitted SVC search under its own file name. It was previously
# written to 'logit_pipe.sav', which overwrote the saved logistic-regression
# search with the SVC one.
dump(best_svc_pipe, 'svc_pipe.sav')
best_svc_pipe = load('svc_pipe.sav')
best_svc_pipe.best_params_

In [None]:
# Per-class precision / recall / F1 of the linear SVC model on the held-out
# test set.
svc_predictions = best_svc_pipe.predict(X_test)
print(classification_report(y_test, svc_predictions))

# Deep Learning

In [None]:
# Total number of space-separated tokens across all cleaned tweets.
print(sum(len(tweet.split(' ')) for tweet in df['clean_text']))

# Evaluation