In [1]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

Data Source : https://www.kaggle.com/c/nlp-getting-started/data

In [2]:
train_data = pd.read_csv(r'Data\Real_or_Not_Diaster_Tweets\train.csv')
test_data = pd.read_csv(r'Data\Real_or_Not_Diaster_Tweets\test.csv')
print('Training data shape: ', train_data.shape)
train_data.head()

Training data shape:  (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


By default, a scikit learn Count vectorizer can perform the following opertions over a text corpus:

- Encoding via utf-8
- converts text to lowercase
- Tokenizes text using word level tokenization

In [3]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [4]:
count_vectorizer = CountVectorizer(stop_words = stopwords)
count_vectorizer.fit(train_data['text'])

train_vectors = count_vectorizer.transform(train_data['text'])
test_vectors = count_vectorizer.transform(test_data['text'])

train_vectors.shape

(7613, 21498)

**MIN_DF** lets you ignore those terms that appear rarely in a corpus. In other words, if MIN_dfis 2, it means that a word has to occur at least two documents to be considered useful.

**MAX_DF** on the other hand, ignores terms that have a document frequency strictly higher than the given threshold.These will be words which appear a lot of documents.

This means we can eliminate those words that are either rare or appear too frequently in a corpus.

When mentioned in absolute values i.e 1,2, etc, the value means if the word appears in 1 or 2 documents. However, when given in float, eg 30%, it means it appears in 30% of the documents.

In [5]:
count_vectorizer = CountVectorizer(stop_words = stopwords, min_df=2 ,max_df=0.8)
count_vectorizer.fit(train_data['text'])

train_vectors = count_vectorizer.transform(train_data['text'])
test_vectors = count_vectorizer.transform(test_data['text'])

In [6]:
cv_df = pd.DataFrame(train_vectors.toarray(), columns =count_vectorizer.get_feature_names() )
cv_df.head()

Unnamed: 0,00,000,01,02,03,030,04,05,06,07,...,ûïa,ûïhatchet,ûïrichmond,ûïstretcher,ûïthe,ûïwe,ûïwhen,ûïyou,ûò,ûó
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.3.Custom Preprocesser
We can also preprocess the text by passing it as an argument to countvectorizer. The following options are avialable:

strip_accents - This removes any accents from the text during the preprocessing step.
lowercase - which is default set as true but can be set to False if lowercasing isnot desired
preprocessor - we can create our custom preprocessor and set this argument to that.

In [7]:
# Creating a custom preprocessor that lowercases, removes special characters, removes hyperlinks and punctuation
#review = re.sub('[^a-zA-Z]', ' ', sentences[i])
def custom_preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

# Creating a baseline model using CountVectorizer

In [8]:
count_vectorizer = CountVectorizer(token_pattern=r'\w{1,}',
                   ngram_range=(1, 2), stop_words = stopwords,preprocessor=custom_preprocessor)

count_vectorizer .fit(train_data['text'])

train_vectors = count_vectorizer.transform(train_data['text'])
test_vectors = count_vectorizer.transform(test_data['text'])

## Logistic Regression with Count Vectorizer

In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, train_data["target"], cv=5, scoring="f1")
scores

array([0.59516908, 0.52846975, 0.61734694, 0.52475248, 0.70401338])

In [10]:
# Fitting a simple Logistic Regression on Counts
clf.fit(train_vectors, train_data["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
train_F1_score_logR = metrics.f1_score(train_data["target"],clf.predict(train_vectors))
train_F1_score_logR

0.9816045756685733

In [19]:
# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)
sample_submission.to_csv("submission_LR_count.csv", index=False)

It gives me a score of 0.80416 on public dataset in Kaggle

## Naive Bayes with Count Vectorizer

In [12]:
from sklearn.naive_bayes import MultinomialNB

In [13]:
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB, train_vectors, train_data["target"], cv=5, scoring="f1")
print(scores)

# Fitting a simple Naive Bayes model on Counts
clf_NB.fit(train_vectors, train_data["target"])

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf_NB.predict(test_vectors)
sample_submission.to_csv("submission_NB_count.csv", index=False)

[0.64784314 0.60490463 0.68660775 0.65239852 0.71400904]


It gives a score of 0.79129 on public dataset in Kaggle

# Creating a baseline model using TfidfVectorizer`

In [14]:
tfidf_vectorizer = TfidfVectorizer( min_df=3,  max_features=None,analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = stopwords)

train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])
test_tfidf = tfidf_vectorizer.transform(test_data["text"])

## Logistic Regression with TFidf Vectorizer

In [15]:
clf_tv = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf_tv, train_tfidf, train_data["target"], cv=5, scoring="f1")
print(scores)
# Fitting a simple Logistic Regression on TFIDF
clf_tv.fit(train_tfidf, train_data["target"])
# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf_tv.predict(test_tfidf)
sample_submission.to_csv("submission_LR_tfidf.csv", index=False)

[0.55228426 0.50776256 0.55160142 0.48971193 0.65859983]


It gives a score of 0.79711 on public dataset

## Naive Bayes with Tfidf Vectorizer

In [16]:
clf_NB_tv = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_tv, train_vectors, train_data["target"], cv=5, scoring="f1")
print(scores)

# Fitting a simple Naive Bayes model on Counts
clf_NB_tv.fit(train_tfidf, train_data["target"])

# Submission
sample_submission = pd.read_csv(r"Data\Real_or_Not_Diaster_Tweets\sample_submission.csv")
sample_submission["target"] = clf_NB_tv.predict(test_tfidf)
sample_submission.to_csv("submission_NB_tfidf.csv", index=False)

[0.64784314 0.60490463 0.68660775 0.65239852 0.71400904]


It gives a score of 0.79037 on public dataset in Kaggle

In [17]:
from tqdm import tqdm
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [18]:
glove_input_file = 'glove.840B.300d.txt'
word2vec_output_file = 'word2vec_300d_4m_glove.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

(2196017, 300)

In [38]:
model_glove = KeyedVectors.load_word2vec_format(word2vec_output_file,binary = False)

In [None]:
# calculate: (king - man) + woman = ?
result = model_glove.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)