In [5]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import string
# import nltk
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline

In [3]:
# Load the training tweets. The CSV is read with header=None, so the column
# names are supplied explicitly here.
# Open question: is the test data coming from an API?

train = pd.read_csv(
    'Resources/train.csv',
    names=["target", "id", "date", "no_query", "user", "tweet"],
    header=None,
)
train.head()

Unnamed: 0,target,id,date,no_query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
#Remove handles

def remove_pattern(input_txt, pattern):
  """Return `input_txt` with every match of the regex `pattern` removed.

  Parameters
  ----------
  input_txt : str
      Text to clean.
  pattern : str
      Regular expression; each non-overlapping match is deleted.

  Returns
  -------
  str
      The text with all matches replaced by the empty string.
  """
  # Substitute with the pattern itself in a single pass. Re-using each matched
  # string as a new regex (find all matches, then re.sub each one) corrupts the
  # text when one match is a prefix of a later one ("@ab" eats the start of
  # "@abc") and misreads matches that contain regex metacharacters.
  return re.sub(pattern, '', input_txt)

# Strip @handles from every tweet. The pattern is a raw string so that `\w`
# reaches the regex engine verbatim; in a normal string literal `\w` is an
# invalid escape sequence, which Python warns about.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['tweet'], r"@[\w]*")

In [9]:
#Remove special characters, numbers, and punctuation
# Every character that is not an ASCII letter or '#' becomes a space.
# regex=True is passed explicitly: without it, pandas versions whose default is
# regex=False treat the character class as literal text and replace nothing.

train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [11]:
#Remove short words (anything of three characters or fewer)

def _drop_short_words(text):
    """Return `text` keeping only whitespace-separated words longer than 3 characters."""
    kept = [word for word in text.split() if len(word) > 3]
    return ' '.join(kept)

train['tidy_tweet'] = train['tidy_tweet'].apply(_drop_short_words)

In [12]:
# Preview the first rows to compare the raw `tweet` column with `tidy_tweet`.
train.head()

Unnamed: 0,target,id,date,no_query,user,tweet,tidy_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic Awww that bummer shoulda David Ca...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",behaving here because over there


In [13]:
#Tokenization: split each cleaned tweet on whitespace into a list of words

tokenized_tweet = train['tidy_tweet'].apply(str.split)
tokenized_tweet

0          [http, twitpic, Awww, that, bummer, shoulda, D...
1          [upset, that, update, Facebook, texting, might...
2          [dived, many, times, ball, Managed, save, rest...
3                    [whole, body, feels, itchy, like, fire]
4                     [behaving, here, because, over, there]
                                 ...                        
1048571                      [GrandMa, making, Dinenr, with]
1048572        [morning, snack, time, bowl, cheese, noodles]
1048573    [same, here, like, from, Terminiator, movies, ...
1048574                               [great, thaanks, wbuu]
1048575                    [cant, wait, date, this, weekend]
Name: tidy_tweet, Length: 1048576, dtype: object

In [14]:
#Stemming: reduce each token to its Porter stem (e.g. "texting" -> "text")
#Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# Import only the name that is used instead of a wildcard import, so the
# notebook namespace is not filled with whatever else the module exposes.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem token by token, keeping one list of stems per tweet.
tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [15]:
# Re-join each tweet's stemmed tokens into one space-separated string.
# One vectorised pass replaces the per-row Python loop, which performed a
# label-based Series assignment for every tweet and was only correct when the
# index was exactly 0..n-1.
# `tokenized_tweet` is rebound to the joined strings, i.e. the same contents the
# loop left in it, so later cells see the same values.
tokenized_tweet = tokenized_tweet.apply(' '.join)

train['stemmed_tweet'] = tokenized_tweet
train.head()

Unnamed: 0,target,id,date,no_query,user,tweet,tidy_tweet,stemmed_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",http twitpic Awww that bummer shoulda David Ca...,http twitpic awww that bummer shoulda david ca...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...,upset that updat facebook text might result sc...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds,dive mani time ball manag save rest bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",behaving here because over there,behav here becaus over there
