In [19]:
#Libraries used for data importing and cleaning
import pandas as pd
import re
import string

#Libraries used for TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Import the data
tweet = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
#Next I am going to concat the two datasets so that the data cleaning I do on them are equal
df=pd.concat([tweet,test])

In [4]:
#There are 3 universal basic steps for data cleaning that apply to this data
#Removing url's, html tags, and punctuation
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [5]:
#now to execute the functions to cclean the data
df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))

In [7]:
#Another common task that must be dealt with is the emojis. 
#Some people simply like to remove them and not deal with them
#Others like to use them as additional features. 
#For now I will take thhe path of least resistence and simply remove them
df_noemoji = df.copy()

In [9]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [10]:
df_noemoji['text']=df_noemoji['text'].apply(lambda x: remove_emoji(x))

In [11]:
#The next thing that needs to be done is to tokenize the text, I will make a new DF for this
df_token_noemoji = df_noemoji.copy()

In [12]:
df_token_noemoji.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1.0
1,4,,,Forest fire near La Ronge Sask Canada,1.0
2,5,,,All residents asked to shelter in place are be...,1.0
3,6,,,13000 people receive wildfires evacuation orde...,1.0
4,7,,,Just got sent this photo from Ruby Alaska as s...,1.0


In [13]:
#This function
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df_token_noemoji['text'] = df_token_noemoji['text'].apply(lambda x: tokenize(x.lower()))

In [17]:
#The next decision I am going to make is to drop the keyword and location columns as they are not useful
df_token_noemoji_drop = df_token_noemoji.drop(['id','keyword','location'], axis=1)

In [30]:
#Now that I have the most basic data structure that I want, I can start building my first model
#I'm simply going to use TFIDF and a random forest model for my baseline
#First I need to split the data back into training and testing sets
training_df=df_token_noemoji_drop[df_token_noemoji_drop['target'].notnull()]
test_df=df_token_noemoji_drop[df_token_noemoji_drop['target'].isnull()]
X_train = training_df.drop('target', axis=1)
y_train = training_df['target']
X_test = test_df.drop('target', axis=1)

In [34]:
X_test.head()

Unnamed: 0,text
0,"[just, happened, a, terrible, car, crash]"
1,"[heard, about, earthquake, is, different, citi..."
2,"[there, is, a, forest, fire, at, spot, pond, g..."
3,"[apocalypse, lighting, spokane, wildfires]"
4,"[typhoon, soudelor, kills, 28, in, china, and,..."


In [35]:
#Now to do some TF-IDF
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['text'])
X_train_vect = tfidf_vect.transform(X_train['text'])
X_test_vect = tfidf_vect.transform(X_test['text'])

AttributeError: 'list' object has no attribute 'lower'