# Approaching (almost) every NLP Problem

This is my first notebook for Data Science, for starter I will write some simple models and standard feature extraction.

## I Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import batch_normalization
from keras.utils import np_utils
from sklearn import model_selection, decomposition, preprocessing, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

## II Dataset

In [6]:
train = pd.read_csv("../data/spooky/train.csv")
test = pd.read_csv("../data/spooky/test.csv")
sample = pd.read_csv("../data/spooky/sample_submission.csv")

train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [9]:
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [10]:
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


## III Multiclass logloss evaluation metric by Kaggle
In the next cell, I will write the code of the multiclass logloss evaluation metric given by kaggle for this specific competition (spooky).

In [23]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2
        
    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))

    return -1.0 / rows * vsota

## IV Convert text labels to numerical labels

In [10]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)
print(y[:50])

[0 1 0 2 1 2 0 0 0 2 2 0 1 1 0 2 0 2 0 1 0 1 0 0 0 0 0 0 1 1 0 2 2 1 1 1 1
 2 1 0 1 1 1 2 0 0 2 1 1 0]


## V Split the dataset


In [11]:
x_train, x_valid, y_train, y_valid = train_test_split(train.text.values, y, stratify=y, random_state=42, shuffle=True, test_size=0.1)
print(x_train.shape)
print(x_valid.shape)

(17621,)
(1958,)


## VI Building basic models

I will start with the Logistic Regression using term-frequency - inverse-document-frequency features. 

In [19]:
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', 
analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3), 
use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

tfv.fit(list(x_train) + list(x_valid))

x_train_tfv = tfv.transform(x_train)
x_valid_tfv = tfv.transform(x_valid)

print(x_train_tfv.shape)

(17621, 15102)


In [24]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression()
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print(predictions)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))


[[0.6943152  0.07172106 0.23396374]
 [0.79796084 0.08133734 0.12070181]
 [0.61024582 0.16350201 0.22625217]
 ...
 [0.30012424 0.25166445 0.44821132]
 [0.20335046 0.16891225 0.6277373 ]
 [0.05947416 0.90771653 0.03280932]]
logloss: 0.572 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Now using a count vector
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
ngram_range=(1, 3), stop_words = 'english')

ctv.fit(list(x_train) + list(x_valid))
x_train_ctv = ctv.transform(x_train)
x_valid_ctv = ctv.transform(x_valid)

print(x_train_ctv.shape)

(17621, 400266)


In [28]:
# Now fitting to a simple Logistic Regrassion
clf = LogisticRegression()
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.527 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.578 


In [32]:
clf = MultinomialNB()
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.485 


In [33]:
# Now I will try SVM, but first I will reduce da dimenisonality with singular valued decomposition
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)

x_train_svd = svd.transform(x_train_tfv)
x_valid_svd = svd.transform(x_valid_tfv)

scl = preprocessing.StandardScaler()
scl.fit(x_train_svd)
x_train_svd_scl = scl.transform(x_train_svd)
x_valid_svd_scl = scl.transform(x_valid_svd)


In [34]:
# Fitting a simple SVM
clf = SVC(probability=True)
clf.fit(x_train_svd_scl, y_train)
predictions = clf.predict_proba(x_valid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))


logloss: 0.735 
