In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from tqdm import  tqdm
# Enable tqdm's pandas integration.
# NOTE(review): no tqdm progress call appears in the cells visible here - confirm it is still needed.
tqdm.pandas()
# Input data files are available in the "../input/" directory.
# The listing below shows the contents of one input sub-directory.

import os
# Lists ../input/awd-lstm/; the CSV files read later come from ../input/innoplex/ instead.
print(os.listdir("../input/awd-lstm/"))

from scipy.sparse import hstack

from spacy.lang.en import English
# Module-level spaCy English object; cleanData() passes raw text through it.
nlp = English()

from sklearn import preprocessing, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer,roc_curve, roc_auc_score


# Any results you write to the current directory are saved as output.

['itos_wt103.pkl', 'lstm_wt103.pth']


In [2]:
# Read the train and test CSV files, then preview the training rows.
print("Loading data...")
train_path = "../input/innoplex/train.csv"
test_path = "../input/innoplex/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print("Train shape:", train.shape)
train.head()

Loading data...
Train shape: (5279, 4)


Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [3]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara


In [4]:
# Count the rows per sentiment label to see how balanced the target is.
train['sentiment'].value_counts()

2    3825
1     837
0     617
Name: sentiment, dtype: int64

In [5]:
# Inspect one sample post and its length in characters.
# Label 1 is the SECOND row of train (row labels start at 0), so the message
# names the label explicitly instead of calling it "the first text".

sample_text = train["text"][1]
print('The text at index 1 is:\n\n', sample_text)
len(sample_text)

The first text is:

 I can completely understand why you’d want to try it. But, results reported in lectures don’t always stand up to the scrutiny of peer-review during publication. There so much still to do before this is convincing. I hope that it does work out, I really do. And if you’re aware of and happy with the risks, then that’s great. I just think it’s important to present this in a balanced way, and to understand why we don’t move straight from the first show of promise in an animal study to using drugs on humans. There’s still a lot of animal data to gather, and human data to gather before anyone can tell if it’s safe or effective. I can’t tell you how many times animal studies don’t follow through to humans, but it’s one of the major attrition points in drug development. You’ve been through some of the unpredictability issues with Cladribine/Gilenya, where there was an interaction that wasn’t predicted. But once people try it, the doctors can see patterns and work out what’

1184

In [6]:
# function to clean data

def cleanData(doc, stemming=False):
    """Normalize one raw text into a space-separated string of lemmas.

    The text is lower-cased and passed through the module-level spaCy ``nlp``
    object. Tokens flagged as stop words or punctuation are discarded and
    every remaining token is replaced by its ``lemma_``.

    Parameters
    ----------
    doc : str
        Raw text. A missing value (``None`` / NaN) yields an empty string
        instead of raising on ``.lower()``.
    stemming : bool, optional
        Unused; kept only so existing calls that pass it keep working.

    Returns
    -------
    str
        The retained lemmas joined by single spaces.
    """
    # Missing text would otherwise fail on .lower(); treat it as "no tokens".
    if pd.isna(doc):
        return ""

    parsed = nlp(doc.lower())
    # Single pass: drop stop words and punctuation, keep the lemma of the rest.
    lemmas = [
        token.lemma_
        for token in parsed
        if not token.is_stop and not token.is_punct
    ]
    return " ".join(lemmas)

In [7]:
# Run cleanData on the training text at label 1 and display the cleaned string.
clean_review = cleanData(train['text'][1])
clean_review

'completely understand want try result report lecture stand scrutiny peer review publication convince hope work aware happy risk great think important present balance way understand straight promise animal study drug human lot animal datum gather human datum gather tell safe effective tell time animal study follow human major attrition point drug development unpredictability issue cladribine gilenya interaction predict people try doctor pattern work go clemastine metformin excite give say current condition personal risk tolerance make sense try definitely'

In [8]:
# Overwrite the train text column with its cleaned form (see cleanData).
print("Cleaning train data...\n")
train["text"] = train["text"].map(cleanData)

Cleaning train data...



In [9]:
# Overwrite the test text column with its cleaned form (see cleanData).
print("Cleaning test data...\n")
test["text"] = test["text"].map(cleanData)

Cleaning test data...



In [10]:
# Keep the target labels in their own Series; the column is removed from train afterwards.
y = train['sentiment']

In [11]:
# Remove the target column so train holds the same columns as test.
train = train.drop(columns=['sentiment'])

In [12]:
# Preview only the first rows instead of displaying the whole training frame.
train.head(10)

Unnamed: 0,unique_hash,text,drug
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,autoimmune disease tend come cluster gilenya f...,gilenya
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,completely understand want try result report l...,gilenya
2,fe809672251f6bd0d986e00380f48d047c7e7b76,interest target s1p-1/5 receptor 1 5 like fing...,fingolimod
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,interest grand merci wonder lemtrada ocrevus s...,ocrevus
4,b227688381f9b25e5b65109dd00f7f895e838249,hello everybody late mri result brain cervical...,gilenya
5,a043780c757966243779bf3c0d11bf6eef721971,advice lemtrada choose cladribine think drug d...,cladribine
6,be5a13376933a7f9bbf8e801c31691092f63260a,reply post jesszidek hello jess sorry read cha...,humira
7,08c3c0c702fc97d290204b37798ac62005da5626,expect neurologist want start tysabri keep say...,gilenya
8,8fd3d7ad80791c9343e5cf8a83bd1adf6577d516,think fingolimod miserable failure progressive...,fingolimod
9,793c5af7cc8332df17eb602247d886fbd1c80f89,thank i’m learn lot grace mention husband ca...,tagrisso


In [13]:
# # Bag of Words (word based)
# ctv_word = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df = 200, max_features=5000,
#             ngram_range=(1,2), stop_words = 'english')


In [14]:
# print("Fitting Bag of Words Model on words...\n")
# # Fitting CountVectorizer to both training and test sets
# ctv_word.fit(list(train['text']) + list(test['text']))
# train_ctv_word =  ctv_word.transform(train['text']) 
# test_ctv_word = ctv_word.transform(test['text'])

In [15]:
# print("Fitting Bag of Words Model on characters...\n")

# # Bag of words (character based)
# ctv_char = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode',analyzer='char',
#     stop_words='english', ngram_range=(2, 6), max_features=10000)

# # Fitting CountVectorizer to both training and test sets
# ctv_char.fit(list(train['text']) + list(test['text']))
# train_ctv_char =  ctv_char.transform(train['text']) 
# test_ctv_char = ctv_char.transform(test['text'])

In [16]:
# TF-IDF over word n-grams (unigrams to trigrams)

print("Fitting TF-IDF Model on words...\n")
tfv_word = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=150,
    max_features=5000)

# Vocabulary and IDF weights are learned from the train and test texts together
# (labels are not involved); each split is then transformed separately.
all_word_text = list(train['text']) + list(test['text'])
tfv_word.fit(all_word_text)
train_tfv_word, test_tfv_word = (
    tfv_word.transform(frame['text']) for frame in (train, test)
)

Fitting TF-IDF Model on words...



In [17]:
# TF-IDF over character n-grams (2 to 6 characters)
print("Fitting TF - IDF Model on characters...\n")
# stop_words is deliberately not passed: scikit-learn applies it only when
# analyzer == 'word', so with analyzer='char' it never affected the features
# and only triggers an "unused parameter" warning on newer releases.
tfv_char = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char',
    ngram_range=(2, 6), max_features=10000)

# Vocabulary and IDF weights are learned from the train and test texts together,
# then each split is transformed separately.
tfv_char.fit(list(train['text']) + list(test['text']))
train_tfv_char = tfv_char.transform(train['text'])
test_tfv_char = tfv_char.transform(test['text'])

Fitting TF - IDF Model on characters...



In [18]:
# print("Combining Bag of words for words and characters...\n")
# # bag of words for training set (words + char)
# train_bow = hstack([train_ctv_word, train_ctv_char])
# test_bow = hstack([test_ctv_word, test_ctv_char])

print("Combining TF-IDF for words and characters...\n")

# Put the word-level and char-level TF-IDF columns side by side,
# for the training split and for the test split alike.
train_blocks = (train_tfv_word, train_tfv_char)
test_blocks = (test_tfv_word, test_tfv_char)
train_tfidf = hstack(train_blocks)
test_tfidf = hstack(test_blocks)

Combining TF-IDF for words and characters...



In [19]:
# create dataframe for features

# train_bow_final = pd.DataFrame(train_bow.todense())
# test_bow_final = pd.DataFrame(test_bow.todense())

# Convert the sparse TF-IDF matrices into dense DataFrames, one column per feature.
train_tfidf_final, test_tfidf_final = (
    pd.DataFrame(sparse_block.toarray()) for sparse_block in (train_tfidf, test_tfidf)
)

In [20]:
# train_bow_final.columns = ['col'+ str(x) for x in train_bow_final.columns]
# test_bow_final.columns = ['col'+ str(x) for x in test_bow_final.columns]

# Prefix every TF-IDF column label with "col" so the labels become strings.
for tfidf_frame in (train_tfidf_final, test_tfidf_final):
    tfidf_frame.columns = ['col{}'.format(label) for label in tfidf_frame.columns]

In [21]:
# del train_tfv_char,train_tfv_word,test_tfidf,test_tfv_char,test_tfv_word
# gc.collect()

In [22]:
# One-hot encode the drug name independently for each split; the two results
# may therefore have different columns when a drug occurs in only one split.
train_drug_df, test_drug_df = (
    pd.get_dummies(frame['drug']) for frame in (train, test)
)

In [23]:
# Assemble the model inputs: drug indicator columns followed by the TF-IDF columns.
train_parts = [train_drug_df, train_tfidf_final]
test_parts = [test_drug_df, test_tfidf_final]

train_feats1 = pd.concat(train_parts, axis=1)
# The test frame additionally gets a fresh 0..n-1 row index.
test_feats1 = pd.concat(test_parts, axis=1).reset_index(drop=True)

In [24]:
# # merge count (bag of word) features into train
# train_feats2 = pd.concat([train_drug_df, train_bow_final], axis = 1)
# test_feats2 = pd.concat([test_drug_df, test_bow_final], axis=1)

# test_feats2.reset_index(drop=True, inplace=True)

In [25]:
# Align the test feature columns with the training feature columns in one step:
#   - columns present in train but absent from test are created and filled with 0,
#   - columns present only in test are dropped,
#   - the resulting column order matches train_feats1 exactly.
# reindex returns a new frame, so re-running this cell gives the same result.
test_feats1 = test_feats1.reindex(columns=train_feats1.columns, fill_value=0)

In [26]:
from sklearn.naive_bayes import GaussianNB

In [27]:
# Check the cross-validation score of the model.
# The CV score acts as an estimate of the model's performance on unseen data.
# NOTE(review): the TF-IDF vectorizers were fitted on train + test text before any
# fold split, so the estimate is not strictly unbiased - confirm this is acceptable.

mod1 = GaussianNB()

In [28]:
def f1_score_multiclass(y_true,y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper around ``f1_score`` that fixes ``average="macro"`` so the
    function can be handed to ``make_scorer`` with the two-argument signature.
    """
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return macro_f1

In [29]:
## Naive Bayes 1: 5-fold cross-validated macro-F1 on the drug + TF-IDF features
macro_f1_scorer = make_scorer(f1_score_multiclass)
cv_scores = cross_val_score(mod1, train_feats1, y, scoring=macro_f1_scorer, cv=5)
print(cv_scores)

[0.38284038 0.39727382 0.40092248 0.38472387 0.37565525]


In [30]:
## Naive Bayes 2 - bow is giving lower CV score
# print(cross_val_score(mod1, train_feats2, y, cv=5, scoring=make_scorer(f1_score_multiclass)))

In [31]:
# Fit a separate GaussianNB instance on all training rows for the final predictions.
clf1 = GaussianNB()
clf1.fit(train_feats1, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
# Display the (rows, columns) shape of the test feature frame.
test_feats1.shape

(2924, 11419)

In [33]:
# Predict a sentiment label for every row of the test feature frame.
preds1 = clf1.predict(test_feats1)

In [34]:
# Submission frame: the test row identifier next to its predicted sentiment.
submission_columns = {'unique_hash': test['unique_hash'], 'sentiment': preds1}
sub1 = pd.DataFrame(submission_columns)

In [35]:
# Count the predicted labels; compare against the label counts of the training target.
sub1['sentiment'].value_counts()

2    1177
1    1142
0     605
Name: sentiment, dtype: int64

In [36]:
## Write the submission file to the current directory, without the row index column.
sub1.to_csv('submission1.csv', index=False)