In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re 
import string
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, TextVectorization
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [6]:
data = pd.read_csv('training.1600000.processed.noemoticon.csv',
                   encoding='ISO-8859-1', engine='python')
data.columns = ['label', 'time', 'date', 'query', 'username', 'text']
data.head()

Unnamed: 0,label,time,date,query,username,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
data.tail()

Unnamed: 0,label,time,date,query,username,text
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [8]:
len(data)

1599999

In [9]:
data.shape

(1599999, 6)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   label     1599999 non-null  int64 
 1   time      1599999 non-null  int64 
 2   date      1599999 non-null  object
 3   query     1599999 non-null  object
 4   username  1599999 non-null  object
 5   text      1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [11]:
data.dtypes

label        int64
time         int64
date        object
query       object
username    object
text        object
dtype: object

In [12]:
np.sum(data.isnull().any(axis=1))

np.int64(0)

In [13]:
data = data[['text', 'label']]

In [14]:
data['label'][data['label'] == 4] = 1

In [15]:
data_pos = data[data['label'] == 1]
data_neg = data[data['label'] == 0]

In [16]:
data_pos = data_pos.iloc[:int(20000)]
data_neg = data_neg.iloc[:int(20000)]

In [17]:
data = pd.concat([data_pos, data_neg])

In [21]:
data['text'] = data['text'].str.lower()

In [26]:
data.head()

Unnamed: 0,text,label
799999,love @health4uandpets u guys r best!!,1
800000,im meeting one besties tonight! cant wait!! - ...,1
800001,"@darealsunisakim thanks twitter add, sunisa! g...",1
800002,sick really cheap hurts much eat real food plu...,1
800003,@lovesbrooklyn2 effect everyone,1


In [23]:
stopwords_list = stopwords.words('english')

In [24]:
", ".join(stopwords.words('english'))

"a, about, above, after, again, against, ain, all, am, an, and, any, are, aren, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can, couldn, couldn't, d, did, didn, didn't, do, does, doesn, doesn't, doing, don, don't, down, during, each, few, for, from, further, had, hadn, hadn't, has, hasn, hasn't, have, haven, haven't, having, he, he'd, he'll, her, here, hers, herself, he's, him, himself, his, how, i, i'd, if, i'll, i'm, in, into, is, isn, isn't, it, it'd, it'll, it's, its, itself, i've, just, ll, m, ma, me, mightn, mightn't, more, most, mustn, mustn't, my, myself, needn, needn't, no, nor, not, now, o, of, off, on, once, only, or, other, our, ours, ourselves, out, over, own, re, s, same, shan, shan't, she, she'd, she'll, she's, should, shouldn, shouldn't, should've, so, some, such, t, than, that, that'll, the, their, theirs, them, themselves, then, there, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, 

In [29]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()

799999                love @health4uandpets u guys r best!!
800000    im meeting one besties tonight! cant wait!! - ...
800001    @darealsunisakim thanks twitter add, sunisa! g...
800002    sick really cheap hurts much eat real food plu...
800003                      @lovesbrooklyn2 effect everyone
Name: text, dtype: object

In [30]:
english_punctuations = string.punctuation
punctuation_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuation_list)
    return text.translate(translator)

In [31]:
data['text'] = data['text'].apply(lambda x: cleaning_punctuations(x))
data['text'].tail()

19995                                     one day holidays
19996                     feeling right  hate damn humprey
19997    geezi hv read whole book personality types emb...
19998     threw sign donnie bent get thingee made sad face
19999    heather2711 good thing find none ones like com...
Name: text, dtype: object

In [32]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)\1+','r\1', text)

In [33]:
data['text'] = data['text'].apply(lambda x: cleaning_repeating_char(x))
data['text'].tail()

19995                                     one day holidays
19996                     frling rightrhate damn humprey
19997    grzi hv read whole brk personality types emb...
19998     threw sign dorie bent get thingr made sad face
19999    heather27r grd thing find none ones like com...
Name: text, dtype: object

In [34]:
def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

In [35]:
data['text'] = data['text'].apply(lambda x: cleaning_email(x))
data['text'].tail()

19995                                     one day holidays
19996                     frling rightrhate damn humprey
19997    grzi hv read whole brk personality types emb...
19998     threw sign dorie bent get thingr made sad face
19999    heather27r grd thing find none ones like com...
Name: text, dtype: object

In [36]:
def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]))', ' ', data)

In [37]:
data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))
data['text'].tail()

19995                                     one day holidays
19996                     frling rightrhate damn humprey
19997    grzi hv read whole brk personality types emb...
19998     threw sign dorie bent get thingr made sad face
19999    heather27r grd thing find none ones like com...
Name: text, dtype: object

In [38]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

In [40]:
data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].tail()

19995                                     one day holidays
19996                     frling rightrhate damn humprey
19997    grzi hv read whole brk personality types emb...
19998     threw sign dorie bent get thingr made sad face
19999    heatherr grd thing find none ones like come ...
Name: text, dtype: object

In [41]:
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)

In [42]:
data['text'].head()

799999             [love, healthuandpets, u, guys, r, best]
800000    [im, mr, ting, one, besties, tonight, cant, wa...
800001    [darealsunisakim, thanks, twir, er, ar, sunisa...
800002    [sick, rear, y, cheap, hurts, much, eat, real,...
800003                  [lovesbrr, klyn, er, ect, everyone]
Name: text, dtype: object

In [49]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

data['text'] = data['text'].apply(lambda x: stemming_on_text(x))

In [50]:
data.head()

Unnamed: 0,text,label
799999,"[love, healthuandpet, u, guy, r, best]",1
800000,"[im, mr, ting, one, besti, tonight, cant, wait...",1
800001,"[darealsunisakim, thank, twir, er, ar, sunisa,...",1
800002,"[sick, rear, y, cheap, hurt, much, eat, real, ...",1
800003,"[lovesbrr, klyn, er, ect, everyon]",1


In [51]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))

In [52]:
data['text'].head()

799999               [love, healthuandpet, u, guy, r, best]
800000    [im, mr, ting, one, besti, tonight, cant, wait...
800001    [darealsunisakim, thank, twir, er, ar, sunisa,...
800002    [sick, rear, y, cheap, hurt, much, eat, real, ...
800003                   [lovesbrr, klyn, er, ect, everyon]
Name: text, dtype: object