In [5]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from tqdm import tqdm
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

Data Source : https://www.kaggle.com/c/nlp-getting-started/data

In [6]:
train_data = pd.read_csv(r'Data\Real_or_Not_Diaster_Tweets\train.csv')
test_data = pd.read_csv(r'Data\Real_or_Not_Diaster_Tweets\test.csv')
print('Training data shape: ', train_data.shape)
train_data.head()

Training data shape:  (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
def custom_preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub('<.*?>+', '', text)  # remove html tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

**300dim-Glove Word emebeddings file can be downloaded from [here](http://www-nlp.stanford.edu/data/glove.840B.300d.zip)**

I could not able to upload the mentoined embedding file,as the size of the file is so huge (5.25GB)

The first step is to convert the GloVe file format to the word2vec file format. This can be done by calling the glove2word2vec() function.

In [8]:
# glove_input_file = 'glove.840B.300d.txt'
# word2vec_output_file = 'word2vec_300d_4m_glove.txt'
# glove2word2vec(glove_input_file, word2vec_output_file)

**Even the converted file (in word2vec) also in size of 5.25 GB. Please execute the above code cell & you will able to convert the Glove embeddings original format to word2vec format**

Once converted, the file can be loaded just like word2vec file

In [9]:
model_glove = KeyedVectors.load_word2vec_format('word2vec_300d_4m_glove.txt',binary = False)
print("Finished loading glove vectors in w2v format")

Finished loading glove vectors in w2v format


In [10]:
# # calculate: (king - man) + woman = ?
# result = model_glove.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# print(result)

In [11]:
from nltk import word_tokenize

**Models with sent2vec_mean - Converting Sentence to vector by averaging the word token vectors of a given sentence**

In [15]:
def sent2vec_mean(text_data):
    Dim = model_glove.get_vector('king').shape[0]
    
    X_vector = np.zeros((len(text_data),Dim))
    i = 0
    empty_vec = 0
    for sent in tqdm(text_data):
        sent = custom_preprocessor(sent)
        tokens = word_tokenize(sent)
        tokens = [t for t in tokens if t.isalpha()]
        word_vecs = []
        for tokens in sent:
            try:
                word_vecs.append(model_glove.get_vector(tokens))
            except KeyError:
                pass
        if len(word_vecs) > 0:
            word_vecs = np.array(word_vecs)
            X_vector[i] = word_vecs.mean(axis = 0)
        else:
            empty_vec+=1
        i+=1
    
#     print("Number of samples with no words found: %s / %s" %(empty_vec, len(text_data)))
    print(f"Number of samples with no words found {empty_vec} out of {len(text_data)} samples")
    
    return X_vector

In [16]:
Xtrain = sent2vec_mean(train_data.text)
Ytrain = train_data.target

100%|████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:06<00:00, 1168.82it/s]

Number of samples with no words found 0 out of 7613 samples





In [17]:
Xtrain.shape

(7613, 300)

In [27]:
Xtrain[0].shape

(300,)

In [19]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [20]:
Xtest = sent2vec_mean(test_data.text)

100%|████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:03<00:00, 1039.33it/s]

Number of samples with no words found 0 out of 3263 samples





In [18]:
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, Xtrain, Ytrain, cv=5, scoring="f1")
scores

array([0.49499545, 0.50255537, 0.50724638, 0.48392555, 0.50801688])

In [21]:
# Fitting a simple Logistic Regression on Counts
clf.fit(Xtrain, Ytrain)

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf.predict(Xtest)
sample_submission.to_csv("submission_word_vecs_lr.csv", index=False)

It gives a score of 0.62886 on public dataset in Kaggle

In [22]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [23]:
RFC = RandomForestClassifier(n_estimators = 300)
scores = model_selection.cross_val_score(RFC, Xtrain, Ytrain, cv=5, scoring="f1")
scores

array([0.46804326, 0.46598322, 0.48449612, 0.5257732 , 0.5092511 ])

In [25]:
# Fitting a simple Logistic Regression on Counts
RFC.fit(Xtrain, Ytrain)

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = RFC.predict(Xtest)
sample_submission.to_csv("submission_word_vecs_rfc.csv", index=False)

It gives a score of 0.68495 on public dataset in Kaggle

**Models with sent2vec_mean - Converting Sentence to vector by normalzing the word token vectors of a given sentence**

In [28]:
def sent2vec_norm(text_data):
    Dim = model_glove.get_vector('man').shape[0]
    
    X_vector = np.zeros((len(text_data),Dim))
    i = 0
    empty_vec = 0
    for sent in tqdm(text_data):
        sent = custom_preprocessor(sent)
        tokens = word_tokenize(sent)
        tokens = [t for t in tokens if t.isalpha()]
        word_vecs = []
        for tokens in sent:
            try:
                word_vecs.append(model_glove.get_vector(tokens))
            except KeyError:
                pass
        if len(word_vecs) > 0:
            word_vecs = np.array(word_vecs)
            vec_sum = word_vecs.sum(axis = 0)
            X_vector[i] = vec_sum/np.sqrt((vec_sum ** 2).sum())
        else:
            empty_vec+=1
        i+=1
    
#     print("Number of samples with no words found: %s / %s" %(empty_vec, len(text_data)))
    print(f"Number of samples with no words found {empty_vec} out of {len(text_data)} samples")
    
    return X_vector

In [30]:
Xtrain_norm = sent2vec_norm(train_data.text)
Ytrain = train_data.target

100%|████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:05<00:00, 1470.30it/s]

Number of samples with no words found 0 out of 7613 samples





In [31]:
Xtest_norm = sent2vec_norm(test_data.text)

100%|████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:02<00:00, 1452.73it/s]

Number of samples with no words found 0 out of 3263 samples





In [32]:
clf_wv = LogisticRegression(C=1.0)
# Fitting a simple Logistic Regression on Counts
clf_wv.fit(Xtrain_norm, Ytrain)

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf_wv.predict(Xtest_norm)
sample_submission.to_csv("submission_word_vecs_norm_lr.csv", index=False)

0.61967 - Kaggle submission score

In [33]:
RFC_norm = RandomForestClassifier(n_estimators = 300)

RFC.fit(Xtrain_norm, Ytrain)

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = RFC.predict(Xtest_norm)
sample_submission.to_csv("submission_word_vecs_norm_rfc.csv", index=False)

0.69230 - Kaggle submission score

In [None]:
# # this function creates a normalized vector for the whole sentence
# def sent2vec_norm(s):
#     words = str(s).lower().decode('utf-8')
#     words = word_tokenize(words)
#     words = [w for w in words if not w in stop_words]
#     words = [w for w in words if w.isalpha()]
#     M = []
#     for w in words:
#         try:
#             M.append(embeddings_index[w])
#         except:
#             continue
#     M = np.array(M)
#     v = M.sum(axis=0)
#     if type(v) != np.ndarray:
#         return np.zeros(300)
#     return v / np.sqrt((v ** 2).sum())