In [1]:
# reference: https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle/notebook

import pandas as pd
import numpy as np

# Load the training and test sets from CSV files in the local data/ directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### Basic Exploratory Data Analysis (EDA)

In [2]:
# Display the training DataFrame (the notebook renders a truncated head/tail view).
train

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP


In [3]:
import random

# Print 100 randomly chosen training sentences to get a feel for the corpus.
# random.randrange excludes the upper bound; the previous random.randint(0, n)
# could return n itself, which is one past the last row and raised a KeyError.
n_texts = len(train['text'])
for _ in range(100):
    random_index = random.randrange(n_texts)
    # .iloc selects by position, so this does not depend on the index labels.
    print(train['text'].iloc[random_index] + "\n")

she is undoubtedly in painting what the Venus is in sculpture."

And yet I am happy; mothers lament their children, wives lose their husbands, while you and my children are left to me.

He found my name a good passport to preferment, and he had procured for me the situation of private secretary to the Ambassador at Vienna, where I should enter on my career under the best auspices.

As he said this he led the way across the ice; I followed.

Among a multitude of opinions upon this delicate point some acute, some learned, some sufficiently the reverse I am able to select nothing which ought to be considered satisfactory.

Glass or no glass, I must explore it a little.

In the radical theory of reanimation they saw nothing but the immature vagaries of a youthful enthusiast whose slight form, yellow hair, spectacled blue eyes, and soft voice gave no hint of the supernormal almost diabolical power of the cold brain within.

"Have you, then, some other attachment?" "None on earth.

A murmur 

### Removing punctuation

In [4]:
import string
# Translation table that deletes every character in string.punctuation.
# Built once so it is not recreated for every row passed to remove_punctuations.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    :param text: the string to clean
    :return: a new string without ASCII punctuation; all other characters
        (including whitespace) are left unchanged
    """
    return text.translate(_PUNCTUATION_TABLE)

# Quick sanity check of remove_punctuations on a sample sentence.
example = "To a crown, a golden be gemmed crown, I hope;"
print(remove_punctuations(example))

To a crown a golden be gemmed crown I hope


In [5]:
# Strip punctuation from the text column of both data sets (the column is overwritten).
# The function is passed directly; wrapping it in a lambda adds nothing.
train['text'] = train['text'].apply(remove_punctuations)
test['text'] = test['text'].apply(remove_punctuations)

### Encoding Features

In [6]:
from sklearn import preprocessing

# Map the author labels to integer class ids; `le` is kept so the ids can be
# translated back to author names (le.classes_) later.
le = preprocessing.LabelEncoder()
le.fit(train['author'])
author_encoded = le.transform(train['author'])
author_encoded

array([0, 1, 0, ..., 0, 0, 1])

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Earlier experiments, kept for reference:

# vectorizer = TfidfVectorizer(use_idf=1,smooth_idf=1,stop_words = 'english')
# Result
# Logloss improved from 0.597 to 0.446
# Accuracy improved from 0.820 to 0.832

# vectorizer = TfidfVectorizer(min_df=3, use_idf=1,smooth_idf=1,stop_words = 'english')
# Result
# Logloss improved from 0.579 to 0.444
# Accuracy improved from 0.823 to 0.834

# Current configuration: word 1- to 3-grams, terms must occur in at least
# 3 documents, accents stripped, sublinear term-frequency scaling.
# use_idf / smooth_idf / sublinear_tf are boolean options, so they are passed
# as True instead of the integer 1 (newer scikit-learn releases validate this).
vectorizer = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
# Result
# Logloss improved from 0.576 to 0.429
# Accuracy improved from 0.830 to 0.838

# The vocabulary and IDF weights are learned from the train and test text
# together; only the training rows are transformed here.
vectorizer.fit(list(train['text']) + list(test['text']))
X_tfidf = vectorizer.transform(train['text'])

In [None]:
# Show the repr of the TF-IDF matrix built from the training text.
X_tfidf

### Sampling of TF-IDF

In [None]:
# Inspect the TF-IDF weights of the first training document.
first_doc_tfidf = X_tfidf[0]

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() yields a plain ndarray column (todense() returns an np.matrix).
X_tfidf_sample = pd.DataFrame(first_doc_tfidf.T.toarray(), index=feature_names, columns=['TF-IDF'])

# Keep only the terms that actually occur in this document (weight > 0);
# filtering first avoids sorting the whole, mostly-zero vocabulary.
X_tfidf_sample = X_tfidf_sample[X_tfidf_sample['TF-IDF'] > 0]

# Sort TF-IDF by descending
X_tfidf_sample = X_tfidf_sample.sort_values(by=["TF-IDF"], ascending=False)
X_tfidf_sample

### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words alternative to TF-IDF: raw term counts with English stop words removed.
cv = CountVectorizer(stop_words = 'english')

# Learn the vocabulary from the training text and build the document-term
# count matrix in one step.
X_countv = cv.fit_transform(train['text'])

# Result
# Logloss improved from 0.453 to 0.453
# Accuracy improved from 0.826 to 0.826

### Split training and testing Data

In [None]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF matrix, targets the integer-encoded authors.
X, Y = X_tfidf, author_encoded

# Hold out 10% of the rows for evaluation; the split is shuffled, stratified on
# the labels, and seeded so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    shuffle=True,
    stratify=Y,
    random_state=255,
)


### The original Kaggle competition specifies multi-class log-loss as the evaluation metric.

Here is the source code: https://github.com/dnouri/nolearn/blob/master/nolearn/lasagne/util.py

In [None]:
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like containing the actual target classes: either a
        1-D sequence of integer class indices or a 2-D one-hot matrix with one
        column per class
    :param predicted: Array-like matrix with class predictions, one probability
        per class (rows are samples, columns are classes)
    :param eps: Probabilities are clipped to [eps, 1 - eps] so that log(0) is
        never evaluated
    :return: Mean negative log-probability assigned to the true class
    """
    # Accept lists and pandas objects as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Set column `actual[i]` of row i in one vectorised assignment.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    # Only the true-class entry of each row contributes to the sum.
    total = np.sum(actual * np.log(clipped))
    return -1.0 / rows * total

In [None]:
from sklearn.metrics import accuracy_score

def evaluate(predictions, Y_test):
    """Print and return the log-loss and accuracy of class-probability predictions.

    :param predictions: Matrix of predicted class probabilities, one row per
        sample and one column per class
    :param Y_test: True class indices for the same samples
    :return: Tuple ``(log_loss, accuracy)``
    """
    log_loss = multiclass_logloss(Y_test, predictions)
    # The predicted class is the column with the highest probability.
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy = accuracy_score(Y_test, predictions.argmax(axis=1))
    print("Logloss: %0.3f " % log_loss)
    print("Accuracy: %0.3f " % accuracy)

    return log_loss, accuracy

### Basic Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline: Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training split; `scores` holds the class probabilities for the held-out split.
model = MultinomialNB().fit(X_train, Y_train)
scores = model.predict_proba(X_test)


In [None]:
# Evaluate Basic Naive Bayes on TF-IDF: prints log-loss and accuracy for the
# held-out split and keeps both values for the later comparison.
basic_Naive_Bayes_logloss, basic_Naive_Bayes_accuracy = evaluate(scores, Y_test)

### Grid Search

In [None]:
# Wrap multiclass_logloss as a scikit-learn scorer that is fed predicted probabilities.
# greater_is_better=False marks it as a loss, so the scorer reports the negated value.
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn in favour of
# response_method="predict_proba" — confirm against the installed version.
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [None]:
from sklearn.model_selection import GridSearchCV

nb_model = MultinomialNB()

# Create the pipeline (single step, so its parameters are addressed as 'nb__<name>')
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid: candidate smoothing strengths for Naive Bayes
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model.
# The `iid` argument is no longer passed: it was deprecated and then removed
# from GridSearchCV, where supplying it raises a TypeError.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, refit=True, cv=2)

# Fit Grid Search Model
model.fit(X_train, Y_train)  # we can use the full data here but im only using xtrain.
# best_score_ is the scorer's value, i.e. the negated log-loss (closer to 0 is better).
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

### Naive Bayes after Grid Search

In [None]:
# Refit Naive Bayes with a fixed smoothing value (alpha=0.1) on the training
# split and predict class probabilities for the held-out split.
model = MultinomialNB(alpha=0.1).fit(X_train, Y_train)
scores = model.predict_proba(X_test)


In [None]:
# Score the refitted model on the held-out split.
grid_Search_Naive_Bayes_logloss, grid_Search_Naive_Bayes_accuracy = evaluate(scores, Y_test)

# Print baseline vs. refitted values side by side; the wording says "improved"
# regardless of the direction in which the numbers actually moved.
print("Logloss improved from %0.3f to %0.3f" % (basic_Naive_Bayes_logloss, grid_Search_Naive_Bayes_logloss))
print("Accuracy improved from %0.3f to %0.3f" % (basic_Naive_Bayes_accuracy, grid_Search_Naive_Bayes_accuracy))

In [None]:
# Vectorize the test text with the already-fitted TF-IDF vectorizer and predict
# class probabilities for every test row; the last line displays the array.
test_tfidf = vectorizer.transform(test['text']) 
prediction = model.predict_proba(test_tfidf)
prediction

In [None]:
import csv

# Write the submission file: a header row followed by one row per test id with
# the predicted probability of each author class (columns in le.classes_ order).
# The path uses a forward slash: in the previous 'data\result.csv' the sequence
# '\r' was read as a carriage-return escape, not as a directory separator.
with open('data/result.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['id'] + list(le.classes_))
    for row_id, probs in zip(test['id'], prediction):
        # '%f' keeps the fixed six-decimal formatting of the probabilities.
        writer.writerow([row_id] + ['%f' % p for p in probs])
# No explicit f.close() is needed: leaving the with-block closes the file.