In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import string
# import nltk
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline

In [4]:
# Load the raw training tweets. The CSV is read with header=None, so the
# column names are supplied explicitly here.
train = pd.read_csv(
    'Resources/train.csv',
    header=None,
    names=["target", "id", "date", "no_query", "user", "tweet"],
)
# TODO: confirm whether the test data will come from an API rather than a file.
train.head()

Unnamed: 0,target,id,date,no_query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Drop the 'no_query' column from the training frame.
train = train.drop(labels=['no_query'], axis='columns')

In [6]:
# Remove links: strip http(s) URLs and www-prefixed addresses from each tweet.
#
# - regex=True is passed explicitly: since pandas 2.0 the default is
#   regex=False, which treats the pattern as a literal string (and does not
#   accept case=), so without it no link would be removed.
# - The pattern is a raw string so that \S is not read as a (invalid) string
#   escape sequence.
# - The dot after "www" is escaped so that only a literal "www." prefix is
#   matched, not arbitrary text such as "wwww!!".
train['tidy_tweet'] = train['tweet'].str.replace(
    r'http\S+|www\.\S+', '', case=False, regex=True
)
train.head()

Unnamed: 0,target,id,date,user,tweet,tidy_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot - Awww, that's a bummer. You sho..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, it's not behaving at all...."


In [7]:
#Remove handles

def remove_pattern(input_txt, pattern):
  """Return ``input_txt`` with every match of the regex ``pattern`` removed.

  Parameters
  ----------
  input_txt : str
      Text to clean.
  pattern : str
      Regular expression; each non-overlapping match is deleted.

  Returns
  -------
  str
      The text with all matches replaced by the empty string.
  """
  # Substitute with the pattern itself in a single pass. Feeding each matched
  # text back into re.sub as a new regex would misinterpret metacharacters in
  # the match and could strip a short match (e.g. "@ab") out of a longer one
  # (e.g. "@abc"), leaving fragments behind.
  return re.sub(pattern, '', input_txt)

# Strip @handles from every tweet. Series.apply forwards the keyword argument
# to remove_pattern for each element and returns a Series directly, which
# avoids building an intermediate fixed-width NumPy string array for the
# whole column. The pattern is a raw string so that \w is not read as an
# invalid string escape sequence.
train['tidy_tweet'] = train['tidy_tweet'].apply(remove_pattern, pattern=r"@[\w]*")

In [8]:
# Remove special characters, numbers, and punctuation: every character that is
# not an ASCII letter or '#' is replaced by a space.
# regex=True is required: since pandas 2.0 the default is regex=False, which
# would look for the literal text "[^a-zA-Z#]" and leave the tweets unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [9]:
# Drop short words: keep only whitespace-separated tokens longer than three
# characters and re-join them with single spaces.
train['tidy_tweet'] = train['tidy_tweet'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

In [10]:
# Preview the first rows after cleaning.
train.head(n=5)

Unnamed: 0,target,id,date,user,tweet,tidy_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww that bummer shoulda David Carr Third
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....",behaving here because over there


In [11]:
# Tokenization: split each cleaned tweet on whitespace into a list of words.
tokenized_tweet = train['tidy_tweet'].apply(str.split)

tokenized_tweet

0          [Awww, that, bummer, shoulda, David, Carr, Third]
1          [upset, that, update, Facebook, texting, might...
2          [dived, many, times, ball, Managed, save, rest...
3                    [whole, body, feels, itchy, like, fire]
4                     [behaving, here, because, over, there]
                                 ...                        
1048571                      [GrandMa, making, Dinenr, with]
1048572        [morning, snack, time, bowl, cheese, noodles]
1048573    [same, here, like, from, Terminiator, movies, ...
1048574                               [great, thaanks, wbuu]
1048575                    [cant, wait, date, this, weekend]
Name: tidy_tweet, Length: 1048576, dtype: object

In [12]:
# Stemming: reduce each token to its stem with the Porter stemmer
# (e.g. "update" -> "updat", "feels" -> "feel").
# Reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# Import only the name that is used instead of a wildcard import, so the
# notebook namespace is not filled with whatever else the module exposes.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [13]:
# Re-join each list of stemmed tokens into one space-separated string and
# store it as a new column.
# Series.str.join does this in one call instead of a Python loop that writes
# back element by element through label-based indexing. tokenized_tweet is
# deliberately left as token lists (not overwritten), so re-running this cell
# gives the same result instead of joining the characters of already-joined
# strings.
train['stemmed_tweet'] = tokenized_tweet.str.join(' ')
train.head()

Unnamed: 0,target,id,date,user,tweet,tidy_tweet,stemmed_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww that bummer shoulda David Carr Third,awww that bummer shoulda david carr third
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...,upset that updat facebook text might result sc...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds,dive mani time ball manag save rest bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....",behaving here because over there,behav here becaus over there


In [14]:
# Spread the words of each cleaned tweet over separate columns (one word per
# column; shorter tweets are padded with missing values).
train2 = train['tidy_tweet'].str.split(expand=True)
train2.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,Awww,that,bummer,shoulda,David,Carr,Third,,,,...,,,,,,,,,,
1,upset,that,update,Facebook,texting,might,result,School,today,also,...,,,,,,,,,,
2,dived,many,times,ball,Managed,save,rest,bounds,,,...,,,,,,,,,,
3,whole,body,feels,itchy,like,fire,,,,,...,,,,,,,,,,
4,behaving,here,because,over,there,,,,,,...,,,,,,,,,,


In [15]:
# Place the per-word columns next to the original tweet columns.
train_final = pd.concat(objs=[train, train2], axis='columns')

In [16]:
# Preview the combined frame.
train_final.head(n=5)

Unnamed: 0,target,id,date,user,tweet,tidy_tweet,stemmed_tweet,0,1,2,...,17,18,19,20,21,22,23,24,25,26
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww that bummer shoulda David Carr Third,awww that bummer shoulda david carr third,Awww,that,bummer,...,,,,,,,,,,
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,upset that update Facebook texting might resul...,upset that updat facebook text might result sc...,upset,that,update,...,,,,,,,,,,
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,dived many times ball Managed save rest bounds,dive mani time ball manag save rest bound,dived,many,times,...,,,,,,,,,,
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole,body,feels,...,,,,,,,,,,
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all....",behaving here because over there,behav here becaus over there,behaving,here,because,...,,,,,,,,,,


In [17]:
# Keep only the 10000 rows with the largest 'id' values.
train_final = train_final.nlargest(n=10000, columns='id')

In [18]:
# Write the reduced frame to CSV without the index column.
train_final.to_csv(path_or_buf='train_tableau.csv', index=False)

In [20]:
# Write the full cleaned training frame to CSV without the index column.
train.to_csv(path_or_buf='train.csv', index=False)