In [1]:
#Libraries used for data importing and cleaning
import pandas as pd
import re
import string

#Random forest library
from sklearn.ensemble import RandomForestClassifier

#To score the results
from sklearn import model_selection

In [2]:
#Import the data
tweet = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
#Next I am going to concat the two datasets so that the data cleaning I do on them are equal
df=pd.concat([tweet,test])

In [4]:
#I am going to drop the id, keyword, and location columns now
df = df.drop(['id','keyword','location'], axis=1)

In [5]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [6]:
df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))

In [7]:
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1.0
1,Forest fire near La Ronge Sask Canada,1.0
2,All residents asked to shelter in place are be...,1.0
3,13000 people receive wildfires evacuation orde...,1.0
4,Just got sent this photo from Ruby Alaska as s...,1.0


In [8]:
df_noemoji = df.copy()
df_noemoji['text']=df_noemoji['text'].apply(lambda x: remove_emoji(x))

In [9]:
#The next thing that needs to be done is to tokenize the text, I will make a new DF for this
df_token_noemoji = df_noemoji.copy()

In [10]:
#This function
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df_token_noemoji['text'] = df_token_noemoji['text'].apply(lambda x: tokenize(x.lower()))

In [11]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if not word in stop_words]
    return text

df_token_noemoji['text'] = df_token_noemoji['text'].apply(lambda x: remove_stopwords(x))

In [12]:
df_token_noemoji.head()

Unnamed: 0,text,target
0,"[deeds, reason, earthquake, may, allah, forgiv...",1.0
1,"[forest, fire, near, la, ronge, sask, canada]",1.0
2,"[residents, asked, shelter, place, notified, o...",1.0
3,"[13000, people, receive, wildfires, evacuation...",1.0
4,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1.0


In [28]:
# df_noemoji = df.copy()
# df_noemoji['text']=df_noemoji['text'].apply(lambda x: remove_emoji(x))

In [29]:
# #The next thing that needs to be done is to tokenize the text, I will make a new DF for this
# df_token_noemoji = df_noemoji.copy()

In [30]:
# #This function
# def tokenize(text):
#     tokens = re.split('\W+', text)
#     return tokens

# df_token_noemoji['text'] = df_token_noemoji['text'].apply(lambda x: tokenize(x.lower()))

In [13]:
#Now that I have the most basic data structure that I want, I can start building my first model
#I'm simply going to use TFIDF and a random forest model for my baseline
#First I need to split the data back into training and testing sets
training_df=df_token_noemoji[df_token_noemoji['target'].notnull()]
test_df=df_token_noemoji[df_token_noemoji['target'].isnull()]
X_train = training_df.drop('target', axis=1)
y_train = training_df['target']
X_test = test_df.drop('target', axis=1)

In [14]:
import gensim
import numpy as np

In [15]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [16]:
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                        for ls in X_train['text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                        for ls in X_test['text']])

In [18]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [23]:
type(y_train.values.ravel())

numpy.ndarray

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_vect_avg, y_train.values.ravel(), test_size=0.33, random_state=42)

In [40]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [42]:
rfc = RandomForestClassifier(n_estimators=10, criterion = 'entropy', random_state = 0)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

In [45]:
from sklearn.svm import SVC
svc_model = SVC(kernel = 'linear', random_state = 0)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[1446    0]
 [1067    0]]


array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [25]:
scores = model_selection.cross_val_score(rf, X_train_vect_avg, y_train.values.ravel(), cv=5, scoring="f1")
scores

array([0., 0., 0., 0., 0.])