**DIFFERENT MODELS IMPLEMENTATION FOR DETECTING DISASTER TWEETS**


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

**1. DATA EXPLORATION**

In [3]:
# Load the training set and report its shape.
train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
print("Data shape = ", train_data.shape)
# Only the last expression of a cell is rendered automatically, so the preview
# of the training frame must be displayed explicitly or it is silently dropped.
display(train_data.head())
# Load the test set; as the cell's last expression it is rendered by the notebook.
test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_data

Data shape =  (7613, 5)


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


**2. DATA PREPROCESSING**


In [4]:
# Missing data: number of nulls per column and the share of rows they represent
total = train_data.isnull().sum()
percentage = total / len(train_data) * 100
missing_data = pd.concat(
    [total, percentage],
    axis=1,
    keys=['Total', 'Percentage'],
)
missing_data

Unnamed: 0,Total,Percentage
id,0,0.0
keyword,61,0.801261
location,2533,33.272035
text,0,0.0
target,0,0.0


In [5]:
# Drop the columns that are not used as model input. errors='ignore' keeps the
# cell idempotent: because it overwrites train_data, re-running it without this
# flag raises a KeyError once the columns are already gone.
train_data = train_data.drop(columns = ['id', 'location', 'keyword'], errors = 'ignore')

TEXT PREPROCESSING
- Corpus: a large, structured set of texts. Here it is the simplified version of our text data that contains only clean, useful content.
- Bag of words: in practice, the bag-of-words model is mainly used as a tool for feature generation. After transforming the text into a "bag of words", we can calculate various measures to characterize the text (source: Wikipedia).

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_data (data):
    """Turn the raw tweets in ``data['text']`` into a cleaned corpus.

    Each tweet is processed as follows:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped and the remaining words are stemmed
         with the Porter stemmer;
      4. the surviving stems are joined back with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'text'`` column of strings.

    Returns
    -------
    list of str
        One cleaned string per row of ``data``, in row order.
    """
    pstem = PorterStemmer()
    # Build the stop-word set once; rebuilding it for every word of every
    # tweet dominates the run time without changing the result.
    stop_words = set(stopwords.words('english'))

    corpus = []
    # Iterate over the column values (positional order) rather than looking
    # rows up by label, so frames whose index is not 0..n-1 work as well.
    for text in data['text']:
        # Remove unwanted characters, lower-case and tokenize
        words = re.sub("[^a-zA-Z]", ' ', text).lower().split()

        # Remove stop words and stem the remaining words (take the roots)
        stems = [pstem.stem(word) for word in words if word not in stop_words]

        # Append the clean tweet to the corpus
        corpus.append(' '.join(stems))
    return corpus

In [7]:
# To reduce the bag-of-words dimensionality, we should remove words that
# occur only a few times. So we build a dictionary where each key is a word
# and each value is that word's frequency across all tweets.



def bagOfWords(data, data_corpus, min_count=20, vectorizer=None, return_vectorizer=False):
    """Encode a cleaned corpus as a dense bag-of-words count matrix.

    When no ``vectorizer`` is given, the number of features is the number of
    distinct whitespace-separated words occurring at least ``min_count`` times
    in ``data_corpus``; a new ``CountVectorizer(max_features=...)`` is fitted
    on the corpus with that limit.

    When a fitted ``vectorizer`` is given, it is only used to *transform* the
    corpus. This is how a test corpus must be encoded so that it gets the same
    columns as the training matrix.

    Parameters
    ----------
    data : any
        Unused; kept so existing ``bagOfWords(data, corpus)`` calls still work.
    data_corpus : list of str
        Cleaned documents, one string per document.
    min_count : int, default 20
        Minimum total frequency for a word to count towards the feature limit.
    vectorizer : CountVectorizer or None, default None
        Already-fitted vectorizer to reuse instead of fitting a new one.
    return_vectorizer : bool, default False
        If True, return ``(bagWords, vectorizer)`` instead of just ``bagWords``.

    Returns
    -------
    numpy.ndarray, or (numpy.ndarray, CountVectorizer) if ``return_vectorizer``.

    Raises
    ------
    ValueError
        If a new vectorizer is needed but no word reaches ``min_count``.
    """
    from collections import Counter
    from sklearn.feature_extraction.text import CountVectorizer

    if vectorizer is None:
        # Word -> total number of occurrences across all documents
        wordFrequents = Counter(word for tweet in data_corpus for word in tweet.split())

        # Keep only words repeated at least `min_count` times
        n_features = sum(1 for count in wordFrequents.values() if count >= min_count)
        if n_features == 0:
            raise ValueError(
                "no word occurs at least %d times in the corpus; "
                "cannot build a bag of words" % min_count
            )

        # Fit the vocabulary on this corpus, capped at n_features terms
        vectorizer = CountVectorizer(max_features = n_features)
        bagWords = vectorizer.fit_transform(data_corpus).toarray()
    else:
        # Reuse the existing vocabulary so the columns match the fitted matrix
        bagWords = vectorizer.transform(data_corpus).toarray()

    if return_vectorizer:
        return bagWords, vectorizer
    return bagWords

**3. MODELS**

In [8]:
# Clean the training tweets, then encode the cleaned corpus as a bag-of-words matrix.
train_data_corpus = clean_data(train_data)
train_data_bagWords = bagOfWords(train_data, train_data_corpus)
# Print the matrix shape, then let the notebook render the matrix itself.
print(train_data_bagWords.shape)
train_data_bagWords


(7613, 787)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [9]:
# Features are the bag-of-words matrix; labels are the 'target' column of train_data.
X = train_data_bagWords
y = train_data ['target']

# Print both shapes so a row-count mismatch between X and y is visible here.
print('X shape: ', X.shape)
print('y shape: ', y.shape)
X

X shape:  (7613, 787)
y shape:  (7613,)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=55, shuffle=True)

print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)

X_train shape:  (6090, 787)
y_train shape:  (6090,)


# 3.1. Decision Tree Model

In [29]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy criterion with no depth limit set (max_depth=None)
# and a fixed random_state, fitted on the training split.
decisionTreeModel = DecisionTreeClassifier (criterion = 'entropy', max_depth=None, splitter ='best', random_state = 55)
decisionTreeModel.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=55, splitter='best')

In [30]:
#Error: Number of features of the model must match the input. Model n_features is 1410 and input n_features is 700 

#You are supposed to pass numpy arrays and not lists as arguments to the DecisionTree, since your input was a list it gets trained as 70 features (1D list) and your test had list of 30 elements and the classifier sees it as 30 features.
#Nonetheless, you need to reshape your input numpy array and pass it as a matrix
#meaning: X_train.values.reshape(-1, 1) instead of X_train (it should be a numpy array not a list)
#c.fit(X_train.values.reshape(-1, 1), y_train)

#test_data_corpus = clean_data(test_data)

#test_data_bagWords = bagOfWords(test_data, test_data_corpus)


In [31]:
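# PROBLEM: THE MODEL WAS TRAINED WITH A DIFFERENT NUMBER OF FEATURES THAN THE
# TEST DATA PROVIDES. bagOfWords() fits a brand-new CountVectorizer on whatever
# corpus it receives, so calling it on the test corpus builds a different
# vocabulary (and a different max_features) than the one used for training.
# TODO: transform the test corpus with the vectorizer fitted on the training corpus.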


#y_pred_test_data = decisionTreeModel.predict(test_data_bagWords)

In [32]:
#test_data = test_data.drop(columns=['keyword', 'location', 'text'])
#test_data

In [33]:
#test_data['target'] = y_pred_test_data

# 3.2. Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

# NOTE: the assignment below rebinds the name `LogisticRegression` from the
# imported class to the estimator instance. The voting and evaluation cells
# refer to the fitted instance under this name.
LogisticRegression = LogisticRegression(penalty='l2',solver='saga', random_state = 55)
LogisticRegression.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=55, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

# 3.3. Stochastic Gradient Descent Model

In [35]:
from sklearn.linear_model import SGDClassifier


# NOTE: the assignment below rebinds the name `SGDClassifier` from the imported
# class to the estimator instance. The voting and evaluation cells refer to the
# fitted instance under this name.
# Configuration: hinge loss, L1 penalty, at most 100 iterations, fixed random_state.
SGDClassifier = SGDClassifier(loss = 'hinge', 
                              penalty = 'l1',
                              learning_rate = 'optimal',
                              random_state = 55, 
                              max_iter=100)

SGDClassifier.fit(X_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=100, n_iter_no_change=5, n_jobs=None, penalty='l1',
              power_t=0.5, random_state=55, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

# 3.4. Support Vector Machine


In [36]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, C=2, at most 10000 solver
# iterations and a fixed random_state, fitted on the training split.
SVClassifier = SVC(kernel= 'linear',
                   degree=3,
                   max_iter=10000,
                   C=2, 
                   random_state = 55)

SVClassifier.fit(X_train,y_train)



SVC(C=2, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=10000, probability=False, random_state=55, shrinking=True,
    tol=0.001, verbose=False)

# 3.5. Gaussian Naive Bayes Model

In [37]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
gaussianNBModel = GaussianNB()
gaussianNBModel.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

# 3.6. Multinomial Naive Bayes Model

In [38]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with alpha=0.1, fitted on the word-count training matrix.
multinomialNBModel = MultinomialNB(alpha=0.1)
multinomialNBModel.fit(X_train,y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# 3.7. Bernoulli Naive Bayes Model

In [39]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with alpha=0.1, fitted on the training split.
bernoulliNBModel = BernoulliNB(alpha=0.1)
bernoulliNBModel.fit(X_train,y_train)

BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=True)

# 3.8. K-Nearest Neighbors model

In [40]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier: 7 neighbours, distance-weighted votes,
# brute-force neighbour search.
KNeighborsModel = KNeighborsClassifier(n_neighbors = 7,
                                       weights = 'distance',
                                      algorithm = 'brute')

KNeighborsModel.fit(X_train,y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='distance')

# 3.9. Gradient Boosting model


In [41]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 estimators of max_depth 30, learning rate 0.01,
# 'deviance' loss and a fixed random_state, fitted on the training split.
gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 100,
                                                   max_depth = 30,
                                                   random_state=55)

gradientBoostingModel.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=30,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=55, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# 3.10. Voting classifier model

In [42]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs for the ensemble. `LogisticRegression` and
# `SGDClassifier` here are the estimator instances created in the earlier
# cells, not the sklearn classes of the same name.
modelsNames = [('LogisticRegression',LogisticRegression),
               ('SGDClassifier',SGDClassifier),
               ('SVClassifier',SVClassifier),
               ('bernoulliNBModel',bernoulliNBModel),
               ('multinomialNBModel',multinomialNBModel)]

# Hard-voting ensemble over the five estimators above, fitted on the training split.
votingClassifier = VotingClassifier(voting = 'hard',estimators= modelsNames)
votingClassifier.fit(X_train,y_train)



VotingClassifier(estimators=[('LogisticRegression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=55, solver='saga',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('SGDClassifier',
                              SGDClassifier(alpha=0.0001, average=False,
                                            class_w...
                                  gamma='scale', kernel='linear',
                                  ma

# 4. Evaluation

In [43]:
from sklearn.metrics import accuracy_score, f1_score

# Every fitted estimator to compare. `LogisticRegression` and `SGDClassifier`
# are the estimator instances bound to those names in the earlier cells.
models = [decisionTreeModel, gradientBoostingModel, KNeighborsModel, LogisticRegression, 
          SGDClassifier, SVClassifier, bernoulliNBModel, gaussianNBModel, multinomialNBModel, votingClassifier]

for model in models:
    model_name = type(model).__name__

    # Predict on the held-out split once and reuse the predictions for both
    # accuracy and F1. A classifier's score() is the mean accuracy of
    # predict(), so calling score(X_test, y_test) as well would repeat the
    # same prediction pass.
    y_pred = model.predict(X_test)

    print(model_name,' Train Score is   : ' ,model.score(X_train, y_train))
    print(model_name,' Test Score is    : ' ,accuracy_score(y_test, y_pred))
    print(model_name,' F1 Score is      : ' ,f1_score(y_test,y_pred))
    print('--------------------------------------------------------------------------')

DecisionTreeClassifier  Train Score is   :  0.9761904761904762
DecisionTreeClassifier  Test Score is    :  0.7419566644780039
DecisionTreeClassifier  F1 Score is      :  0.6743993371996686
--------------------------------------------------------------------------
GradientBoostingClassifier  Train Score is   :  0.8586206896551725
GradientBoostingClassifier  Test Score is    :  0.7544320420223244
GradientBoostingClassifier  F1 Score is      :  0.6375968992248061
--------------------------------------------------------------------------
KNeighborsClassifier  Train Score is   :  0.9761904761904762
KNeighborsClassifier  Test Score is    :  0.7406434668417596
KNeighborsClassifier  F1 Score is      :  0.5872518286311389
--------------------------------------------------------------------------
LogisticRegression  Train Score is   :  0.8502463054187193
LogisticRegression  Test Score is    :  0.7826657912015759
LogisticRegression  F1 Score is      :  0.7230125523012553
-------------------------