In [1]:
#import all needed libraries 
import pandas as pd
import numpy as np
import keras 
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import helpers
import re


Using TensorFlow backend.


### Data preprocessing.

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

Let look at the train data and test data.

In [3]:
print("Training Data:",'\n Columns:',train_data.columns, '\n Shape:', train_data.shape, len(train_data))
print("Test Data:",'\n Columns:',test_data.columns, '\n Shape:', test_data.shape, len(test_data))
train_data.head(5)

Training Data: 
 Columns: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object') 
 Shape: (7613, 5) 7613
Test Data: 
 Columns: Index(['id', 'keyword', 'location', 'text'], dtype='object') 
 Shape: (3263, 4) 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Now let do data preprocessing and also fix labels. 
For this we use <b>helpers.py</b>. Put it in the same folder as this Jupyter notebook.

In [4]:
train_data = helpers.fix_labels(train_data)
train_data['text'] = train_data['text'].apply(lambda x : helpers.data_preprocessing(x))
test_data['text'] = test_data['text'].apply(lambda x : helpers.data_preprocessing(x))
train_data.head()

Unnamed: 0,id,keyword,location,text,target,target_fixed
0,1,,,deeds reason earthquake may allah forgive us,1,1
1,4,,,forest fire near la ronge sask canada,1,1
2,5,,,residents asked shelter place notified officer...,1,1
3,6,,,people receive wildfires evacuation orders cal...,1,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1,1


Let divide data on the validation and train set. Note: test_size = 0.25

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(train_data['text'], train_data['target'], test_size=0.25, random_state=42)

Before classification, we need to present tweets as TF-IDF vectors. We will use TfidfVectorizer.

In [6]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(X_train)
val_tfidf = tfidf.transform(X_val)
test_tfidf = tfidf.transform(test_data['text'])

### Classification

Finally, we can proceed to classification. In this project, it has been decided to work with SVM and LR. Parameters are chosen using Grid Search.

In [8]:
#import sklearn
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#### SVM

In [21]:
#grid search
parameters = {'kernel':('linear', 'rbf','poly'), 'C':[1, 5, 10],'gamma':[0.1, 0.001, 0.0001, 'scale']}
svm = SVC()
grid = GridSearchCV(svm, parameters, scoring = 'f1_macro')
grid.fit(train_tfidf,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10],
                         'gamma': [0.1, 0.001, 0.0001, 'scale'],
                         'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

In [22]:
grid.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}

In [23]:
svm = SVC(C = 1, gamma = 0.1, kernel = 'linear')
svm.fit(train_tfidf,y_train)
y_pred = svm.predict(val_tfidf)

Let pickle our models for further usage.

In [25]:
import pickle
with open("svm.p", "wb")as f:
    pickle.dump(svm, f)

In [26]:
print(classification_report(y_val, y_pred))
print("Accuracy = ", accuracy_score(y_val,y_pred))
print("F1-score = ", f1_score(y_val,y_pred, average = 'macro'))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1091
           1       0.81      0.72      0.76       813

    accuracy                           0.81      1904
   macro avg       0.81      0.80      0.80      1904
weighted avg       0.81      0.81      0.81      1904

Accuracy =  0.8077731092436975
F1-score =  0.8001440849458763


#### Logistic Regression

In [27]:
#grid search
from sklearn.linear_model import LogisticRegression
parameters = {'penalty':('l1', 'l2'), 'C':[0.01, 0.1,1, 5],'solver':('liblinear','lbfgs'), 'tol':[1e-4, 1e-5, 1e-3]}
lr = LogisticRegression()
grid = GridSearchCV(lr, parameters, scoring = 'f1_macro')
grid.fit(train_tfidf,y_train)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 5], 'penalty': ('l1', 'l2'),
                         'solver': ('liblinear', 'lbfgs'),
                         'tol': [0.0001, 1e-05, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

In [28]:
#Logistic Regression
grid.best_params_

{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001}

In [29]:
lr = LogisticRegression(penalty = 'l2',C = 1, solver = 'lbfgs', tol = 0.0001)
lr.fit(train_tfidf,y_train)
y_pred = lr.predict(val_tfidf)

In [30]:
#pickle LR
with open("lr.p", "wb")as f:
    pickle.dump(lr, f)

In [31]:
print(classification_report(y_val, y_pred))
print("Accuracy = ", accuracy_score(y_val,y_pred))
print("F1-score = ", f1_score(y_val,y_pred, average = 'macro'))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1091
           1       0.84      0.70      0.76       813

    accuracy                           0.81      1904
   macro avg       0.82      0.80      0.80      1904
weighted avg       0.82      0.81      0.81      1904

Accuracy =  0.8125
F1-score =  0.8031480533832842


#### Conclusion: from above methods LR and SVM are almost equally efficient based on accuracy and F1-score.
But still we would check them by uploading the results to the Kaggle competition page.

In [34]:
#submit SVM
y_pred_svm = svm.predict(test_tfidf)
submission_svm = pd.read_csv('data/sample_submission.csv')
submission_svm['target'] = y_pred_svm
submission_svm.to_csv('predictions/submission_svm.csv', index=False)

#submit LR
y_pred_lr = lr.predict(test_tfidf)
submission_lr = pd.read_csv('data/sample_submission.csv')
submission_lr['target'] = y_pred_lr
submission_lr.to_csv('predictions/submission_lr.csv', index=False)

