In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,confusion_matrix,classification_report

In [2]:
data = pd.read_csv('train_new.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,0,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,"May 20, 2012",27
1,1,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,"April 27, 2010",192
2,2,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,"December 14, 2009",17
3,3,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,"November 3, 2015",10
4,4,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,"November 27, 2016",37


In [4]:
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
data.shape

(158584, 6)

In [6]:
conditions = dict(data['condition'].value_counts())

In [7]:
top_10 = list(conditions.keys())[0:10]

In [8]:
top_10

['Birth Control',
 'Depression',
 'Pain',
 'Anxiety',
 'Acne',
 'Bipolar Disorde',
 'Insomnia',
 'Weight Loss',
 'Obesity',
 'ADHD']

In [9]:
review_by_condition = dict(data.groupby(['condition'])['review'].value_counts())

In [10]:
len(review_by_condition)

111136

In [11]:
review_df = pd.DataFrame(review_by_condition.keys(),columns=['condition','Review'])

In [12]:
review_df

Unnamed: 0,condition,Review
0,ADHD,I am a 19 year old college student and just go...
1,ADHD,I was on this med for a little over a week Rig...
2,ADHD,Works great
3,ADHD,10 10 miracle for my ADD fog fatigue OCD Can n...
4,ADHD,12 hours 140 mg
...,...,...
111131,zen Shoulde,The only side effect I have experienced with t...
111132,zen Shoulde,This medication has been a God send for me Aft...
111133,zen Shoulde,Very helpful for my frozen shoulder pain with ...
111134,zen Shoulde,Very little relief I finished PT and after a y...


In [13]:
#filtering reviews of top 10 conditions
df = review_df[review_df.condition.isin(top_10)]

In [14]:
round(df['condition'].value_counts()/len(df)*100,2)

Birth Control      36.18
Depression         12.55
Pain                8.85
Anxiety             8.64
Acne                7.87
Bipolar Disorde     5.75
Insomnia            5.64
Weight Loss         5.26
Obesity             4.89
ADHD                4.37
Name: condition, dtype: float64

In [15]:
len(data['condition'].unique())

669

**Stemming**

In [16]:
from nltk.stem import SnowballStemmer

In [17]:
def stem(x):
    tokens = x.split()
    stemmed_tokens = [SnowballStemmer('english').stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['stem_rev'] = df['Review'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
df.drop(['Review'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [19]:
df = df.reset_index(drop=True)

In [20]:
df

Unnamed: 0,condition,stem_rev
0,ADHD,i am a 19 year old colleg student and just got...
1,ADHD,i was on this med for a littl over a week righ...
2,ADHD,work great
3,ADHD,10 10 miracl for my add fog fatigu ocd can now...
4,ADHD,12 hour 140 mg
...,...,...
50099,Weight Loss,so i start phentermin on tuesday 9 29 2015 at ...
50100,Weight Loss,start take 37 5 mg for weight loss on 3 31 17 ...
50101,Weight Loss,the first time i start this was back in april ...
50102,Weight Loss,updat 7 4 2011 today weight 237 for those of y...


## Modeling

In [21]:
def metrics(pred):
    print('accuracy : {} %'.format(round(accuracy_score(pred,y_test)*100,2)))
    print('recall : {} %'.format(round(recall_score(pred,y_test,average='macro')*100,2)))
    print('precision : {} %'.format(round(precision_score(pred,y_test,average='macro')*100,2)))
    print('f1 score : {} %'.format(round(f1_score(pred,y_test,average='macro')*100,2)))

In [22]:
classes = ['ADHD','Acne','Anxiety','Bipolar Disorde','BirthControl','Depression','Insomnia','Obesity','Pain','Weight Loss']
def cm(pred):
    print(confusion_matrix(y_test,pred))
    print(classification_report(y_test,pred,target_names=classes))

In [24]:
X = df['stem_rev'].values
y = df['condition'].values

In [25]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

(35072,) (35072,) (15032,) (15032,)
i decid in 06 08 after i had my 3rd child to do someth more long term so i chose implanon at the time of insert i weigh 167 and now after 15 month i have been stuck at 194 i also have have been bleed for 15 month with an occasion 3 7 day break i am constant use tampon and panti liner my sex drive is aw i don t even think about it the sex drive and the bleed is put a damper on my marriag and has made my husband kranki the last 3 week i haven t bled i m hope i have either stop for good or it make me a littl curious if i am pregnant i go to the doctor on the 15th and i am go to see what other option i have
Birth Control
i bought this 2 3 month befor i got marri and had veri littl side effect i have been marri 2 5 year and i haven t gotten pregnant yet i think the onli annoy is that my period come everi other month and that i get some spot off and on throughout the month my period are shorter and there is less bleed which is a nice thing i haven t taken 

In [26]:
# Validation of train data
training_labels = set(y_train)
print(training_labels)
from scipy.stats import itemfreq
training_category_dist = itemfreq(y_train)
print(training_category_dist)

{'Birth Control', 'Anxiety', 'Obesity', 'Pain', 'Depression', 'Weight Loss', 'ADHD', 'Bipolar Disorde', 'Insomnia', 'Acne'}
[['ADHD' 1523]
 ['Acne' 2756]
 ['Anxiety' 2995]
 ['Bipolar Disorde' 2002]
 ['Birth Control' 12780]
 ['Depression' 4399]
 ['Insomnia' 1945]
 ['Obesity' 1746]
 ['Pain' 3113]
 ['Weight Loss' 1813]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  """


**MNB unigrams**

In [113]:
cv_mnb_uni = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english',token_pattern='[a-z]+',ngram_range=(1,2),lowercase=True)

In [114]:
X_train_vec_1 = cv_mnb_uni.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec_1.shape)
print(X_train_vec_1[0].toarray())

# check the size of the constructed vocabulary
print(len(cv_mnb_uni.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(cv_mnb_uni.vocabulary_.items())[:10])

(35072, 49601)
[[0 0 0 ... 0 0 0]]
49601
[('decid', 9835), ('rd', 34572), ('child', 6786), ('someth', 38925), ('long', 23958), ('term', 42395), ('chose', 6865), ('implanon', 20218), ('time', 43158), ('insert', 20645)]


In [115]:
X_test_vec_1 = cv_mnb_uni.transform(X_test)
print(X_test_vec_1.shape)

(15032, 49601)


In [116]:
mnb_1 = MultinomialNB()
mnb_1.fit(X_train_vec_1,y_train)
mnb_1_pred = mnb_1.predict(X_test_vec_1)

In [117]:
feature_ranks = sorted(zip(mnb_1.coef_[4], cv_mnb_uni.get_feature_names()))
category_features = feature_ranks[-10:]
print(category_features)

[(-5.0828139100790075, 'week'), (-5.058278666282728, 'control'), (-5.036792552477708, 'year'), (-4.997376351537348, 'm'), (-4.925672315793902, 'day'), (-4.8947294210318315, 've'), (-4.724801151735738, 'pill'), (-4.459985364234466, 't'), (-4.403144241309679, 'month'), (-4.367889456030126, 'period')]


In [118]:
def show_most_and_least_informative_features(vectorizer, clf, class_idx=0, n=10):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[class_idx], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [119]:
show_most_and_least_informative_features(cv_mnb_uni,mnb_1,7,30)

	-12.0002	aa             		-6.0706	weigh          
	-12.0002	ab             		-6.0521	food           
	-12.0002	abat           		-6.0012	loss           
	-12.0002	abdomen area   		-5.9767	lost lbs       
	-12.0002	abdomen pain   		-5.9364	s              
	-12.0002	abil concentr  		-5.9271	veri           
	-12.0002	abil focus     		-5.8909	exercis        
	-12.0002	abil function  		-5.8538	time           
	-12.0002	abil orgasm    		-5.8517	help           
	-12.0002	abil sleep     		-5.8098	contrav        
	-12.0002	abil stay      		-5.8037	lose           
	-12.0002	abilifi        		-5.7976	medic          
	-12.0002	abilifi becaus 		-5.7012	pill           
	-12.0002	abilifi day    		-5.6830	onli           
	-12.0002	abilifi help   		-5.6510	just           
	-12.0002	abilifi mg     		-5.6116	ve             
	-12.0002	abilifi month  		-5.4693	work           
	-12.0002	abilifi seroquel		-5.4349	feel           
	-12.0002	abilifi start  		-5.3734	pound          
	-12.0002	abilifi week   		-5.

In [78]:
metrics(mnb_1_pred)

accuracy : 79.02 %
recall : 75.11 %
precision : 69.37 %
f1 score : 71.17 %


In [79]:
cm(mnb_1_pred)

TypeError: 'numpy.ndarray' object is not callable

In [36]:
err_cnt_1 = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='Bipolar Disorde' and mnb_1_pred[i]=='Depression'):
        print(X_test[i])
        err_cnt_1 = err_cnt_1+1
print("errors:", err_cnt_1)

i ve experienc a slight increas in blood pressur but no other side effect i m feel more focus and energet
i ve been on abilifi for near 2 year the first month or so was rough tire earli at night or after a cocktail restless sleep wake up earli dizzi foggi head but after my dr reduc my dose to 2 5 mg day side effect improv the one that remain are profus night sweat soak sheet and constip it doe make you a bit tire but i take it befor bed it complet help with my irrit agit toward peopl and my children i have a ton more patienc and less anxieti overal the biggest issu i have is the cost get the abilifi save card if you are start a new prescript
i had such bad side effect from abilifi with no posit effect my cholesterol went up i had heartburn or acid reflux everi night that would wake me up i got total paranoid and was even scare to drive or ride in a car i had unusu fear of die i had a hard time lay still and would have to move my leg and arm i got realli bad dizzi spell usual while i wa

**mnb uni and bigrams**

In [37]:
cv_mnb_bi = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english',token_pattern='[a-z]+',ngram_range=(1,2),lowercase=True)

In [38]:
X_train_vec_2 = cv_mnb_bi.fit_transform(X_train)

# check the content of a document vector
print(X_train_vec_2.shape)
print(X_train_vec_2[0].toarray())

# check the size of the constructed vocabulary
print(len(cv_mnb_bi.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(cv_mnb_bi.vocabulary_.items())[-10:])

(35072, 49601)
[[0 0 0 ... 0 0 0]]
49601
[('uptight', 45048), ('rang emot', 34512), ('work unless', 48483), ('antidepress antipsychot', 1927), ('risk pregnant', 36053), ('effect awesom', 12805), ('level general', 22693), ('look lose', 24148), ('s adhd', 36181), ('onli rare', 30236)]


In [39]:
X_test_vec_2 = cv_mnb_bi.transform(X_test)
print(X_test_vec_2.shape)

(15032, 49601)


In [40]:
mnb_2 = MultinomialNB()
mnb_2.fit(X_train_vec_2,y_train)
mnb_2_pred = mnb_2.predict(X_test_vec_2)

In [41]:
metrics(mnb_2_pred)

accuracy : 83.9 %
recall : 80.58 %
precision : 76.65 %
f1 score : 78.06 %


In [42]:
cm(mnb_2_pred)

[[ 519    0   31    4    5   75   15    6    8    4]
 [   0 1049    3    2  115   10    0    2    2    5]
 [   5    0  898    7   11  329   55    1   27    2]
 [   9    2   69  489   17  245   23   12   11    4]
 [   1   65   14    5 5199   37    3    7   10    6]
 [   8    3  132   58   25 1575   39    9   24   14]
 [   0    5   49    8    7   60  724    1   23    3]
 [   1    0    8    2    4   34    1  382    5  265]
 [   1    8   28    2   12   43   14    1 1214    0]
 [   0    1    3    1    9   27    2  207    9  563]]
                 precision    recall  f1-score   support

           ADHD       0.95      0.78      0.86       667
           Acne       0.93      0.88      0.90      1188
        Anxiety       0.73      0.67      0.70      1335
Bipolar Disorde       0.85      0.56      0.67       881
   BirthControl       0.96      0.97      0.97      5347
     Depression       0.65      0.83      0.73      1887
       Insomnia       0.83      0.82      0.82       880
        Obes

In [43]:
err_cnt_2 = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='Bipolar Disorde' and mnb_2_pred[i]=='Depression'):
        print(X_test[i])
        err_cnt_2 = err_cnt_2+1
print("errors:", err_cnt_2)

i ve experienc a slight increas in blood pressur but no other side effect i m feel more focus and energet
my experi with latuda is most negat i have been on it for 5 month and will be taper off in a coupl of week thank the onli good thing about it is it stabil me in a sever mix mania with psychosi episod but other than that the side effect are horribl it has done noth for my depress the past two month and i have had anxieti and major irrit on it the nausea and vomit is miser i was prescrib zofran to offset the nausea but that is not so help either sinc the zofran caus constip the 350 calori requir is a pain becaus you can t take it with dinner or you fall asleep instant so i take it with a late night meal to last a few more hour henc more weight gain
i had been on latudia 2 yrs with excel result i feel it save my life but due to be on medicar now the cost is not cover i am now on the roller coaster tri to find the right fit for my bipolar this is veri depress
it knock out extrem sad fo

**Bernoulli**

In [64]:
bool_bnb = CountVectorizer(encoding='latin-1', binary=True, min_df=5,stop_words='english',lowercase=True,ngram_range=(1,1))

In [65]:

# Initializing the BNB model and using the training data to train the BNB model

from sklearn.naive_bayes import BernoulliNB
X_train_vec_bool = bool_bnb.fit_transform(X_train)
be_clf = BernoulliNB()
bernoulliNB_clf = be_clf.fit(X_train_vec_bool, y_train)

In [66]:
X_test_vec_bol = bool_bnb.transform(X_test)

In [70]:
show_most_and_least_informative_features(bool_bnb, be_clf, class_idx=7, n=30)

	-7.4662	025            		-1.7558	weigh          
	-7.4662	1000x          		-1.7426	appetit        
	-7.4662	100mcg         		-1.7232	becaus         
	-7.4662	100mgs         		-1.7232	diet           
	-7.4662	100x           		-1.7137	don            
	-7.4662	102            		-1.6732	tri            
	-7.4662	105lbs         		-1.6551	loss           
	-7.4662	10mgs          		-1.6083	medic          
	-7.4662	10s            		-1.5829	contrav        
	-7.4662	10yr           		-1.5527	veri           
	-7.4662	110lbs         		-1.5446	pill           
	-7.4662	113            		-1.5366	time           
	-7.4662	117            		-1.5340	help           
	-7.4662	117lbs         		-1.5234	exercis        
	-7.4662	119            		-1.5104	lose           
	-7.4662	1200mg         		-1.4260	ve             
	-7.4662	123            		-1.3481	just           
	-7.4662	125lbs         		-1.3156	onli           
	-7.4662	125mg          		-1.2987	lbs            
	-7.4662	128            		-1.2476	feel           


In [62]:

# print confusion matrix for BNB Algorithm

y_pred = bernoulliNB_clf.fit(X_train_vec_bool, y_train).predict(X_test_vec_bol)
cm=confusion_matrix(y_test, y_pred)
print(cm)

[[ 381    1   28    4    2  137    8    1  102    3]
 [   0 1007    2    1  123    9    0    0   45    1]
 [   1    0  797    6    2  338   39    1  150    1]
 [   2    1   65  366   10  290   16    4  126    1]
 [   0   41   13    1 5176   41    1    1   73    0]
 [   3    1   99   25   11 1547   20    4  173    4]
 [   0    0   35    6    0   68  591    0  180    0]
 [   0    0    6    2   16   52    1  360   64  201]
 [   0    0   13    0    0   22    4    0 1284    0]
 [   1    0    5    0   16   51    1  214   53  481]]


In [63]:
metrics(y_pred)

accuracy : 79.76 %
recall : 79.01 %
precision : 69.59 %
f1 score : 71.88 %


In [50]:

# print classification report for BNB
target_names = ['ADHD','Acne','Anxiety','Bipolar Disorde','BirthControl','Depression','Insomnia','Obesity','Pain','Weight Loss']
print(classification_report(y_test, y_pred, target_names=target_names))

                 precision    recall  f1-score   support

           ADHD       0.92      0.78      0.84       667
           Acne       0.95      0.86      0.90      1188
        Anxiety       0.73      0.65      0.68      1335
Bipolar Disorde       0.79      0.67      0.72       881
   BirthControl       0.97      0.96      0.97      5347
     Depression       0.71      0.73      0.72      1887
       Insomnia       0.80      0.81      0.80       880
        Obesity       0.60      0.61      0.60       702
           Pain       0.66      0.96      0.78      1323
    Weight Loss       0.69      0.59      0.64       822

       accuracy                           0.82     15032
      macro avg       0.78      0.76      0.77     15032
   weighted avg       0.83      0.82      0.82     15032



In [51]:
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='Obesity' and y_pred[i]=='Weight Loss'):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

i am a 23 y o femal and i start take adipex about 2 year ago i start at 310 and i current weigh 209 0 in those 2 year i took adipex about 4 differ time the last time i took it was in decemb i have stop use these pill becaus a continu of them can have side effect on your heart and liver they were amaz and definit help boost my weight loss i experienc fast beat heart mood swing and main dri mouth i had dri mouth the last time i took it but i think it was a pretti good side effect becaus i drank so much water i have maintain my weight loss becaus i chang my lifestyl i now eat healthier and exercis 4x a week i have gain muscl so my weight is up i recommend these pill
just start take phentermin 6 18 16 the first day i thought i was gonnapassout from side affect but after 3 day i feel good lot of energi and decreas appetit i have experienc the dri mouth and i never realli drank water befor so this is good for me anyway i am 5 0 and weigh 164 lbs sinc im so short it consid obes but im take in

## cross val

In [66]:
#Multinomial with bigrams
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
mnb_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=False, min_df=5,ngram_range=(1,2),stop_words='english')),('nb', LinearSVC())])
scores = cross_val_score(mnb_pipe, X_train, y_train, cv=10)
avg=sum(scores)/len(scores)
print(avg)

0.8176893809709137


In [65]:
bNB_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',binary=True,min_df=5,ngram_range=(1,1),lowercase=True)),('bernNB',BernoulliNB())])
scores = cross_val_score(bNB_pipe,X_train,y_train,cv=10)
print(sum(scores)/len(scores))

0.7993559549739094


In [52]:
##Bernoulli with Boolean Input
bNB_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',binary=False,min_df=5)),('bernNB',BernoulliNB())])
scores = cross_val_score(bNB_pipe,X_train,y_train,cv=10)
print(sum(scores)/len(scores))

##MNB with Bool
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=True,min_df=20,ngram_range=(1,2),stop_words='english')),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe,X_train,y_train, cv=10)
avg=sum(scores)/len(scores)
print(avg)

##MNB with TF Uni and Bigram
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',binary=False,min_df=5,ngram_range=(1,2),stop_words='english')),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X_train,y_train, cv=10)
avg=sum(scores)/len(scores)
print(avg)

##MNB TF Input using TFIDF Vectorizer
mNB_tf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=False,binary=False,min_df=20,ngram_range=(1,2),stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tf_pipe,X_train,y_train,cv=10)
print(sum(scores)/len(scores))

##MNB TFIDF Vectorizer and TFIDF Input
mNB_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False,min_df=20,ngram_range=(1,2),stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe,X_train,y_train,cv=10)
print(sum(scores)/len(scores))

0.7993559549739094
0.8326868823031572
0.8399008953911691
0.7684193431023603
0.8016366842792669


## SVM

In [120]:
from sklearn.svm import LinearSVC

svm = LinearSVC(C=1)
svm.fit(X_train_vec_1,y_train)
svm_pred = svm.predict(X_test_vec_1)

In [121]:
metrics(svm_pred)

accuracy : 81.59 %
recall : 75.6 %
precision : 74.53 %
f1 score : 75.0 %


In [122]:
feature_ranks = sorted(zip(svm.coef_[0], cv_mnb_uni.get_feature_names()))

least_imp = feature_ranks[-20:]
print("least_imp")
for i in range(0, len(least_imp)):
    print(least_imp[i])
print()

## get 10 features that are least relevant to "very negative" sentiment (they are at the top of the ranked list)
most_imp = feature_ranks[:20]
print("most_imp")
for i in range(0, len(most_imp)):
    print(most_imp[i])
print()

least_imp
(0.723534175346759, 'grade')
(0.731048394487043, 'lose appetit')
(0.7377670082312041, 'gave life')
(0.7447947847728376, 'lot panic')
(0.7499840341509455, 'drug effect')
(0.7807579075563846, 'help great')
(0.7846075522124216, 'add')
(0.7916276500968731, 'calm lot')
(0.8081298056321672, 'adderal')
(0.8617946501323158, 'vyvans')
(0.8650654638089088, 'ritalin')
(0.9243491076346638, 'hour mg')
(0.9446983378783418, 'strattera')
(0.9637454155540524, 'concerta')
(0.9976665800363405, 'focus')
(1.000161172992766, 'caus migrain')
(1.0485544021203805, 'help month')
(1.0596687113702208, 'wonder like')
(1.072937635559151, 'optim')
(1.262883773213107, 'adhd')

most_imp
(-0.7221016313723163, 'life mg')
(-0.517326121713208, 'acn')
(-0.49407826519002557, 'lessen')
(-0.47485105987177634, 'focus concentr')
(-0.47279339026938993, 'seroquel')
(-0.4620536646121457, 'matter')
(-0.4473830872231505, 'pain')
(-0.4453542501443465, 'live life')
(-0.4421209857689435, 'mg didn')
(-0.43345427289933336, 'ati

In [132]:
show_most_and_least_informative_features(cv_mnb_bi, svm, class_idx=7, n=30)

	-0.5516	acn            		0.6173	alway hurt     
	-0.5367	break          		0.6205	great say      
	-0.5305	wast money     		0.6236	taken phentermin
	-0.5244	school         		0.6291	work tri       
	-0.5153	doe suppress   		0.6414	stomach alway  
	-0.5152	pound alreadi  		0.6452	drug weight    
	-0.5116	recommend drug 		0.6484	diethylpropion 
	-0.4658	extrem dri     		0.6497	phendimetrazin 
	-0.4637	lowest         		0.6527	great stop     
	-0.4545	mention        		0.6575	problem lost   
	-0.4523	frustrat       		0.6652	hcg            
	-0.4489	today lost     		0.7172	medic march    
	-0.4447	month mention  		0.7286	meridia        
	-0.4403	lbs veri       		0.7311	great alreadi  
	-0.4361	phentermin past		0.7316	great lost     
	-0.4349	great start    		0.7542	lost lot       
	-0.4346	month went     		0.7556	bontril        
	-0.4311	week lose      		0.7602	experi effect  
	-0.4246	pristiq        		0.7702	orlistat       
	-0.4228	loss week      		0.7703	just work      
	-0.4226	continu po

In [124]:
print(confusion_matrix(y_test,svm_pred))

[[ 536    1   33   12   11   42    6    5   13    8]
 [   2 1041    5    4  109   10    4    3    5    5]
 [  15    2  913   43   16  240   58    9   35    4]
 [  17    5   68  573   16  143   24   13   15    7]
 [   1   68   14   10 5177   37    4   10   23    3]
 [  39    5  214  124   46 1323   48   29   45   14]
 [   9    2   49   19    8   51  696    7   36    3]
 [   1    3   11    5   14   29    7  386   17  229]
 [   9    5   33   16   14   47   17    2 1177    3]
 [   4    3    9    6   20   29    5  290   13  443]]


In [125]:
print(classification_report(y_test,svm_pred))

                 precision    recall  f1-score   support

           ADHD       0.85      0.80      0.82       667
           Acne       0.92      0.88      0.90      1188
        Anxiety       0.68      0.68      0.68      1335
Bipolar Disorde       0.71      0.65      0.68       881
  Birth Control       0.95      0.97      0.96      5347
     Depression       0.68      0.70      0.69      1887
       Insomnia       0.80      0.79      0.80       880
        Obesity       0.51      0.55      0.53       702
           Pain       0.85      0.89      0.87      1323
    Weight Loss       0.62      0.54      0.57       822

       accuracy                           0.82     15032
      macro avg       0.76      0.75      0.75     15032
   weighted avg       0.82      0.82      0.82     15032



In [126]:
err_cnt_3 = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='Obesity' and svm_pred[i]=='Weight Loss'):
        print(X_test[i])
        err_cnt_3 = err_cnt_3+1
print("errors:", err_cnt_3)

15 day trial 11 day result start at 380lbs now 330lbs drop 3 dress size due to the decreas in the fat tumor brought on by dercum s diseas a diseas diagnos by dermatologist no known help till now in 11 day so much progress just tri to find a way to afford i feel great
i am a 23 y o femal and i start take adipex about 2 year ago i start at 310 and i current weigh 209 0 in those 2 year i took adipex about 4 differ time the last time i took it was in decemb i have stop use these pill becaus a continu of them can have side effect on your heart and liver they were amaz and definit help boost my weight loss i experienc fast beat heart mood swing and main dri mouth i had dri mouth the last time i took it but i think it was a pretti good side effect becaus i drank so much water i have maintain my weight loss becaus i chang my lifestyl i now eat healthier and exercis 4x a week i have gain muscl so my weight is up i recommend these pill
i took my first pill today spent the next 9 hour liter aslee

**SVM with bigrams**

In [127]:
svm_1 = LinearSVC(C=1)
svm_1.fit(X_train_vec_2,y_train)
svm_1_pred = svm_1.predict(X_test_vec_2)

In [97]:
show_most_and_least_informative_features(cv_mnb_bi, svm_1, class_idx=2, n=20)

	-0.9245	mcg            		0.8268	dumb           
	-0.8668	anxieti fast   		0.8293	day thank      
	-0.7775	brintellix     		0.8330	anxieti        
	-0.7709	restoril       		0.8336	ativan         
	-0.7241	bipolar        		0.8485	amaz medicin   
	-0.7090	zyprexa        		0.8677	drug good      
	-0.7047	appreci        		0.8746	alprazolam     
	-0.6982	lot panic      		0.8844	make day       
	-0.6944	norepinephrin  		0.9172	vomit hour     
	-0.6901	depress xanax  		0.9235	mellow         
	-0.6809	calm relax     		0.9307	wonder life    
	-0.6419	calm anxieti   		0.9381	experi work    
	-0.6299	acn            		0.9467	vistaril       
	-0.6228	drug veri      		0.9716	job far        
	-0.6157	adhd           		0.9809	year life      
	-0.6139	zolpidem       		1.0342	effect hate    
	-0.6013	latuda         		1.0713	lag            
	-0.5998	ani problem    		1.0719	great especi   
	-0.5991	insomnia effect		1.0862	buspar         
	-0.5964	amaz sleep     		1.2884	pill awesom    


In [130]:
feature_ranks = sorted(zip(svm_1.coef_[4], cv_mnb_bi.get_feature_names()))

least_imp = feature_ranks[:20]
print("least_imp")
for i in range(0, len(least_imp)):
    print(least_imp[i])
print()

most_imp = feature_ranks[-30:]
print("most_imp")
for i in range(0, len(most_imp)):
    print(most_imp[i])
print()

least_imp
(-0.6455977006748841, 'mg')
(-0.6062136875549552, 'paxil')
(-0.5972610946085435, 'zoloft')
(-0.5826323916081257, 'dermatologist')
(-0.564491423611585, 'lexapro')
(-0.555292027537893, 'contrav')
(-0.5404899741029264, 'doc')
(-0.5212718449885312, 'depakot')
(-0.505784423385719, 'lithium')
(-0.5038443153391449, 'far effect')
(-0.5022322507117475, 'zombi')
(-0.5009529333275559, 'celexa')
(-0.49582337656706504, 'skip')
(-0.4749735406528195, 'accutan')
(-0.4738811179154303, 'yr old')
(-0.4711975377844664, 'belviq')
(-0.4520394004609773, 'period period')
(-0.4475135827506351, 'loryna')
(-0.4433240403659657, 'prozac')
(-0.4359475594241428, 'viibryd')

most_imp
(0.671098348938928, 'nuvar')
(0.6778699807833699, 'best birth')
(0.6844723436765645, 'love recommend')
(0.6910172078501955, 'noth good')
(0.6932713749506045, 'sleepi s')
(0.6945006479355331, 'caus gain')
(0.6968784473393554, 'year effect')
(0.7042102109121352, 'effect s')
(0.7291256509630214, 'say noth')
(0.7406522194500715, 'a

In [99]:
metrics(svm_1_pred)

accuracy : 81.59 %
recall : 75.6 %
precision : 74.53 %
f1 score : 75.0 %


In [100]:
print(confusion_matrix(y_test,svm_1_pred))
print(classification_report(y_test,svm_1_pred))

[[ 536    1   33   12   11   42    6    5   13    8]
 [   2 1041    5    4  109   10    4    3    5    5]
 [  15    2  913   43   16  240   58    9   35    4]
 [  17    5   68  573   16  143   24   13   15    7]
 [   1   68   14   10 5177   37    4   10   23    3]
 [  39    5  214  124   46 1323   48   29   45   14]
 [   9    2   49   19    8   51  696    7   36    3]
 [   1    3   11    5   14   29    7  386   17  229]
 [   9    5   33   16   14   47   17    2 1177    3]
 [   4    3    9    6   20   29    5  290   13  443]]
                 precision    recall  f1-score   support

           ADHD       0.85      0.80      0.82       667
           Acne       0.92      0.88      0.90      1188
        Anxiety       0.68      0.68      0.68      1335
Bipolar Disorde       0.71      0.65      0.68       881
  Birth Control       0.95      0.97      0.96      5347
     Depression       0.68      0.70      0.69      1887
       Insomnia       0.80      0.79      0.80       880
        Obes

In [101]:
err_cnt_4 = 0
for i in range(0, len(y_test)):
    if(y_test[i]=='Obesity' and svm_1_pred[i]=='Weight Loss'):
        print(X_test[i])
        err_cnt_4 = err_cnt_4+1
print("errors:", err_cnt_4)

15 day trial 11 day result start at 380lbs now 330lbs drop 3 dress size due to the decreas in the fat tumor brought on by dercum s diseas a diseas diagnos by dermatologist no known help till now in 11 day so much progress just tri to find a way to afford i feel great
i am a 23 y o femal and i start take adipex about 2 year ago i start at 310 and i current weigh 209 0 in those 2 year i took adipex about 4 differ time the last time i took it was in decemb i have stop use these pill becaus a continu of them can have side effect on your heart and liver they were amaz and definit help boost my weight loss i experienc fast beat heart mood swing and main dri mouth i had dri mouth the last time i took it but i think it was a pretti good side effect becaus i drank so much water i have maintain my weight loss becaus i chang my lifestyl i now eat healthier and exercis 4x a week i have gain muscl so my weight is up i recommend these pill
i took my first pill today spent the next 9 hour liter aslee