In [37]:
#Libraries used for data importing and cleaning
import pandas as pd
import numpy as np
import re
import string

In [38]:
#Import the data
tweet = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [39]:
#Next I am going to concat the two datasets so that the data cleaning I do on them are equal
df=pd.concat([tweet,test])

In [40]:
#I am going to drop the id, keyword, and location columns now
df = df.drop(['id','keyword','location'], axis=1)

In [41]:
#There are 3 universal basic steps for data cleaning that apply to this data
#Removing url's, html tags, and punctuation
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

#I am also going to add in the step of removing the emoji's and emoticons
#They too few to be helpful but present enough to be annoying
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [42]:
#now to execute the functions to cclean the data
df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))
df['text']=df['text'].apply(lambda x: remove_emoji(x))

In [43]:
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1.0
1,Forest fire near La Ronge Sask Canada,1.0
2,All residents asked to shelter in place are be...,1.0
3,13000 people receive wildfires evacuation orde...,1.0
4,Just got sent this photo from Ruby Alaska as s...,1.0


In [44]:
df_feature = pd.DataFrame()

In [45]:
## Number of unique words in the text ##
df_feature['num_words'] = df['text'].apply(lambda x: len(str(x).split()))

## Number of characters in the text ##
df_feature["num_unique_words"] = df["text"].apply(lambda x: len(set(str(x).split())))

## Number of stopwords in the text ##
df_feature["num_chars"] = df["text"].apply(lambda x: len(str(x)))

## Number of title case words in the text ##
df_feature["num_punctuations"] =df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
df_feature["num_words_upper"] = df["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Average length of the words in the text ##
df_feature["num_words_title"] = df["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Mean length of the words in the text ##
df_feature["mean_word_len"] = df["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [46]:
df_feature.head()

Unnamed: 0,num_words,num_unique_words,num_chars,num_punctuations,num_words_upper,num_words_title,mean_word_len
0,13,13,68,0,1,5,4.307692
1,7,7,37,0,0,5,4.428571
2,22,18,130,0,0,2,4.954545
3,8,8,63,0,0,1,6.875
4,16,15,86,0,0,3,4.375


In [47]:
df_feature['target'] = df['target']

In [48]:
#Now that I have the most basic data structure that I want, I can start building my first model
#I'm simply going to use TFIDF and a random forest model for my baseline
#First I need to split the data back into training and testing sets
training_df=df_feature[df_feature['target'].notnull()]
test_df=df_feature[df_feature['target'].isnull()]
X_train = training_df.drop('target', axis=1)
y_train = training_df['target']
X_test = test_df.drop('target', axis=1)

In [49]:
#Random forest library
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [50]:
def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

In [51]:
#To score the results
from sklearn import model_selection
scores = model_selection.cross_val_score(rf, X_train, y_train, cv=5, scoring="f1")
scores

array([0.47472151, 0.51827782, 0.49800479, 0.48400328, 0.53225806])