In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc

# progress bar
from tqdm import tqdm, tqdm_notebook

# instantiate
tqdm.pandas(tqdm_notebook)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/"))

from scipy.sparse import hstack

from collections import Counter

from spacy.lang.en import English
nlp = English()

from sklearn import preprocessing, model_selection, metrics,pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer,roc_curve, roc_auc_score

from sklearn.model_selection import GridSearchCV

# Any results you write to the current directory are saved as output.

['train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
# Read the train and test CSV files from the input directory
print("Loading data...")
test = pd.read_csv("../input/test.csv")
train = pd.read_csv("../input/train.csv")

# Report the size of the training frame, then preview its first rows
print("Train shape:", train.shape)
train.head()

Loading data...
Train shape: (5279, 4)


Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [3]:
# Count how many training rows carry each sentiment label
train["sentiment"].value_counts()

2    3825
1     837
0     617
Name: sentiment, dtype: int64

In [4]:
# Keep the target labels in their own Series before the column is dropped from `train`
y = train["sentiment"]

In [5]:
# Remove the label column from the feature frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises a KeyError.
train = train.drop(columns=['sentiment'], errors='ignore')

In [6]:
# function to clean data

def cleanData(doc, stemming=False):
    """Normalise one raw text document into a space-separated string of lemmas.

    The text is lower-cased and tokenised with the module-level ``nlp``
    pipeline; tokens flagged as stop words (``is_stop``) or punctuation
    (``is_punct``) are discarded and the ``lemma_`` of every remaining token
    is joined with single spaces.

    Parameters
    ----------
    doc : str
        Raw text. ``None`` and NaN (as produced by pandas for missing cells)
        are treated as empty text.
    stemming : bool, optional
        Accepted for backward compatibility; currently has no effect.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing cells arrive as None or float NaN (NaN is the only value that is
    # not equal to itself); there is nothing to tokenise in that case.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ""

    parsed = nlp(doc.lower())

    # Single pass: keep only content tokens and take their lemma.
    lemmas = [tok.lemma_ for tok in parsed if not tok.is_stop and not tok.is_punct]

    return " ".join(lemmas)

In [7]:
# Run the cleaning routine over the text column of the training set ...
train['text'] = train['text'].map(cleanData)
# ... and over the text column of the test set
test['text'] = test['text'].map(cleanData)

In [8]:
# Word-level TF-IDF features built from 1- to 3-word n-grams

print("Fitting TF-IDF Model on words...\n")
tfv_word = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1,3),
    stop_words='english',
    strip_accents='unicode',
    min_df=150,
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the train and test texts together
# (the original author describes this as semi-supervised learning)
tfv_word.fit(train['text'].tolist() + test['text'].tolist())
train_tfv_word, test_tfv_word = (
    tfv_word.transform(split['text']) for split in (train, test)
)

Fitting TF-IDF Model on words...



In [9]:
# TF-IDF(char): character n-grams of length 2 to 6
print("Fitting TF - IDF Model on characters...\n")
# No stop_words argument here: scikit-learn only applies stop words when
# analyzer == 'word', so passing it with analyzer='char' had no effect and
# triggers a UserWarning on newer releases.
tfv_char = TfidfVectorizer(sublinear_tf=True,strip_accents='unicode',analyzer='char',
    ngram_range=(2, 6),max_features=10000)

# The vocabulary and IDF weights are learned from the train and test texts together
tfv_char.fit(list(train['text']) + list(test['text']))
train_tfv_char = tfv_char.transform(train['text'])
test_tfv_char = tfv_char.transform(test['text'])

Fitting TF - IDF Model on characters...



In [10]:
print("Combining TF-IDF for words and characters...\n")

# For each split, place the word-level and char-level TF-IDF matrices side by side
train_tfidf, test_tfidf = (
    hstack([word_block, char_block])
    for word_block, char_block in (
        (train_tfv_word, train_tfv_char),
        (test_tfv_word, test_tfv_char),
    )
)

Combining TF-IDF for words and characters...



In [11]:
# Convert the combined sparse TF-IDF matrices into dense DataFrames.
# NOTE(review): this materialises every cell in memory; fine at this data size,
# but worth revisiting if the vocabulary or corpus grows.
train_tfidf_final, test_tfidf_final = (
    pd.DataFrame(sparse_feats.todense()) for sparse_feats in (train_tfidf, test_tfidf)
)

In [12]:
# Name the TF-IDF columns by position: "col0", "col1", ...
# The names are derived from the column count rather than from the current
# labels, so re-running this cell yields the same names instead of stacking
# prefixes ("colcol0").
train_tfidf_final.columns = ['col' + str(i) for i in range(train_tfidf_final.shape[1])]
test_tfidf_final.columns = ['col' + str(i) for i in range(test_tfidf_final.shape[1])]

In [13]:
# Preview the first rows of the dense TF-IDF training frame
train_tfidf_final.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15,col16,col17,col18,col19,col20,col21,col22,col23,col24,col25,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35,col36,col37,col38,col39,...,col11277,col11278,col11279,col11280,col11281,col11282,col11283,col11284,col11285,col11286,col11287,col11288,col11289,col11290,col11291,col11292,col11293,col11294,col11295,col11296,col11297,col11298,col11299,col11300,col11301,col11302,col11303,col11304,col11305,col11306,col11307,col11308,col11309,col11310,col11311,col11312,col11313,col11314,col11315,col11316
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.038088,0.060002,0.060531,0.060559,0.060588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041404,0.04373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.238274,0.0,0.0,0.361311,0.0,0.215629,0.0,0.060641,0.0,0.0,0.0,0.0,0.080189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131198,0.100574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# One-hot encode the drug column, independently for each split
# (the resulting column sets can differ; they are reconciled further down)
train_drug_df, test_drug_df = (
    pd.get_dummies(split['drug']) for split in (train, test)
)

In [15]:
# Build the model inputs: drug indicator columns followed by the TF-IDF columns
train_feats1 = pd.concat([train_drug_df, train_tfidf_final], axis=1)

# Same layout for the test split, with the row index renumbered from zero
test_feats1 = pd.concat([test_drug_df, test_tfidf_final], axis=1).reset_index(drop=True)

In [16]:
# del train_feats_df,test_feats_df,train_drug_df, train_tfidf_final,test_drug_df, test_tfidf_final,train
# gc.collect()

In [17]:
# Columns present in the training features but absent from the test features
# (kept as a variable so it can be inspected)
missing_cols = set( train_feats1.columns ) - set( test_feats1.columns )

# Align the test frame with the training frame in one step:
#   * columns missing from test are created and filled with 0,
#   * columns that exist only in test are dropped,
#   * the remaining columns are put in the training column order.
# A single reindex avoids inserting columns one at a time into a very wide frame.
test_feats1 = test_feats1.reindex(columns=train_feats1.columns, fill_value=0)

# Model Building

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
def f1_score_multiclass(y_true,y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper around ``f1_score`` with ``average="macro"`` fixed, so it can
    be used wherever a two-argument metric function is expected.
    """
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return macro_f1

In [20]:
# Wrap the macro-F1 metric as a scorer object (e.g. for the `scoring` argument of a grid search)
macro_f1score = make_scorer(score_func=f1_score_multiclass)

In [21]:
# nb_model = MultinomialNB()


# # parameter grid
# param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# # Initialize Grid Search Model
# model = GridSearchCV(estimator=nb_model, param_grid=param_grid, scoring=macro_f1score,
#                                  verbose=10, n_jobs=-1, iid=True, refit=True, cv=5)

# # Fit Grid Search Model
# # model.fit(train_feats1, y)  # NOTE: this fits on the full training features (train_feats1), not on a held-out xtrain split.
# print("Best score: %0.3f" % model.best_score_)
# print("Best parameters set:")
# best_parameters = model.best_estimator_.get_params()
# for param_name in sorted(param_grid.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [22]:
# Multinomial Naive Bayes on the combined drug + TF-IDF features.
# NOTE(review): alpha=0.1 is presumably the value found by the commented-out grid search above; confirm.
clf1 = MultinomialNB(alpha=0.1)
clf1.fit(X=train_feats1, y=y)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [23]:
# Sanity check: (rows, columns) of the aligned test feature frame
test_feats1.shape

(2924, 11419)

In [24]:
# Predict a sentiment class for every row of the aligned test features
preds1 = clf1.predict(X=test_feats1)

In [25]:
# Submission frame: the test row identifier next to its predicted sentiment
sub1 = pd.DataFrame({
    'unique_hash': test['unique_hash'],
    'sentiment': preds1,
})

In [26]:
# Distribution of the predicted labels, for comparison with the training class balance
sub1["sentiment"].value_counts()

2    1924
1     674
0     326
Name: sentiment, dtype: int64

In [27]:
## Write the submission file to the working directory (row index omitted)
sub1.to_csv(path_or_buf='submission1.csv', index=False)