In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [3]:
print(train.shape)
print(test.shape)

(5279, 4)
(2924, 3)


In [4]:
train.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


In [5]:
test.head()

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara


In [7]:
train['text_drug'] = train['text'] +' '+train['drug']
test['text_drug'] = test['text'] +' '+test['drug']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(norm='l2', max_features=8000, 
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1,4), use_idf=True, smooth_idf=False, sublinear_tf=True,
            stop_words = 'english',max_df=0.50,min_df=0.00001,lowercase=True
                                  )

In [9]:
X = train[['text_drug']]
y = train['sentiment']

# splitting training data into train and validation datasets
from sklearn.model_selection  import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1983,stratify= y)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(4751, 1)
(4751,)
(528, 1)
(528,)


In [11]:
X_train_transformed = tfidf_vectorizer.fit_transform(X_train['text_drug'])
feature_names = tfidf_vectorizer.get_feature_names()
len(feature_names)

8000

In [12]:
X_val_transformed = tfidf_vectorizer.transform(X_val['text_drug'])

In [13]:
from sklearn.naive_bayes import BernoulliNB

# instantiating bernoulli NB class
bnb=BernoulliNB()

# fitting the model
bnb.fit(X_train_transformed, y_train)

# predict class
y_pred_class = bnb.predict(X_val_transformed)

bnb

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [14]:
# printing the overall accuracy
from sklearn import metrics
from sklearn.metrics import classification_report

print(metrics.confusion_matrix(y_val, y_pred_class))
print(classification_report(y_val, y_pred_class))

[[ 15  41   6]
 [  8  68   8]
 [ 31 243 108]]
             precision    recall  f1-score   support

          0       0.28      0.24      0.26        62
          1       0.19      0.81      0.31        84
          2       0.89      0.28      0.43       382

avg / total       0.70      0.36      0.39       528



In [15]:
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(C=0.5, verbose=True,penalty='l2',class_weight='balanced')
lr_classifier

lr_classifier.fit(X_train_transformed,y_train)
# predict class
y_pred_class = lr_classifier.predict(X_val_transformed)
# Print the accuracy
metrics.accuracy_score(y_val, y_pred_class)

[LibLinear]

0.740530303030303

In [16]:
print(metrics.confusion_matrix(y_val, y_pred_class))
print(classification_report(y_val, y_pred_class))

[[  7   7  48]
 [  2  22  60]
 [  5  15 362]]
             precision    recall  f1-score   support

          0       0.50      0.11      0.18        62
          1       0.50      0.26      0.34        84
          2       0.77      0.95      0.85       382

avg / total       0.70      0.74      0.69       528



In [17]:
from sklearn.model_selection  import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV

stratefied_Kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=9999)
searchCV = LogisticRegressionCV(
        Cs= [0.001, 0.01, 0.1, 1.0] 
    #list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l2'
        ,scoring='f1'
        ,cv=stratefied_Kfold
        ,random_state=1
        ,max_iter=100
        ,fit_intercept=True
        ,class_weight='balanced'
        #,solver='newton-cg'
        ,solver = 'saga' 
        ,verbose = 2
        ,n_jobs = -1
    )

In [18]:
searchCV.fit(X_train_transformed,y_train)

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...

rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...rescaling...

rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
convergence after 16 epochs took 1 seconds
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescalin

rescaling...
rescaling...
rescaling...
convergence after 15 epochs took 1 seconds
convergence after 32 epochs took 1 seconds
convergence after 21 epochs took 1 seconds


  'precision', 'predicted', average, warn_for)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 17 epochs took 1 seconds
rescaling...
rescaling...
rescaling...
rescaling...
rescalin

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    8.4s remaining:    2.3s



convergence after 29 epochs took 0 seconds
convergence after 27 epochs took 1 seconds


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    8.9s finished


convergence after 38 epochs took 0 seconds
convergence after 30 epochs took 1 seconds
convergence after 33 epochs took 0 seconds


LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1.0], class_weight='balanced',
           cv=StratifiedKFold(n_splits=3, random_state=9999, shuffle=True),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=-1, penalty='l2',
           random_state=1, refit=True, scoring='f1', solver='saga',
           tol=0.0001, verbose=2)

In [19]:
y_pred_class = searchCV.predict(X_val_transformed)
# Print the accuracy
metrics.accuracy_score(y_val, y_pred_class)

0.5965909090909091

In [20]:
print(metrics.confusion_matrix(y_val, y_pred_class))
print(classification_report(y_val, y_pred_class))

[[ 30  10  22]
 [ 13  42  29]
 [ 64  75 243]]
             precision    recall  f1-score   support

          0       0.28      0.48      0.36        62
          1       0.33      0.50      0.40        84
          2       0.83      0.64      0.72       382

avg / total       0.68      0.60      0.63       528



In [21]:
from sklearn.ensemble import RandomForestClassifier

RF_classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', class_weight = 'balanced', random_state = 42)
RF_classifier.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [22]:
y_pred_class = RF_classifier.predict(X_val_transformed)
# Print the accuracy
metrics.accuracy_score(y_val, y_pred_class)

0.7291666666666666

In [23]:
print(metrics.confusion_matrix(y_val, y_pred_class))
print(classification_report(y_val, y_pred_class))

[[  1   0  61]
 [  0   3  81]
 [  0   1 381]]
             precision    recall  f1-score   support

          0       1.00      0.02      0.03        62
          1       0.75      0.04      0.07        84
          2       0.73      1.00      0.84       382

avg / total       0.76      0.73      0.62       528



In [24]:
test_transformed = tfidf_vectorizer.transform(test['text_drug'])

In [25]:
y_test_class = searchCV.predict(test_transformed)

In [26]:
submission = pd.DataFrame({"unique_hash" : test['unique_hash'], "sentiment" : y_test_class})
submission.to_csv("sub_lr_cv_v2.csv", index=False)

In [27]:
submission.head()

Unnamed: 0,sentiment,unique_hash
0,2,9e9a8166b84114aca147bf409f6f956635034c08
1,1,e747e6822c867571afe7b907b51f0f2ca67b0e1a
2,2,50b6d851bcff4f35afe354937949e9948975adf7
3,2,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae
4,2,8b37d169dee5bdae27060949242fb54feb6a7f7f
