In [None]:
# Install the third-party `wordcloud` package into the kernel's environment
# (presumably required by the project's word-cloud plotting helper — confirm).
%pip install wordcloud

In [None]:
# Install BeautifulSoup4 into the kernel's environment
# (presumably used by the project's text preprocessing helpers — confirm).
%pip install BeautifulSoup4 

In [None]:
import pandas as pd # data preprocessing
# Notebook display settings: show every row of a displayed frame/series and
# never truncate the text inside a cell (None disables both limits).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# import custom modules
from custom_libs import plotting
from custom_libs import preprocessing
from custom_libs import classification
from custom_libs import dump
from custom_libs import db

In [None]:
# Load the raw training reviews through the project's db helper and preview two rows.
# NOTE(review): what the helper reads/writes on disk is defined in custom_libs.db;
# the rest of the notebook assumes it returns a pandas DataFrame.
df = db.from_tsv_to_csv("drugsComTrain_raw")
df.head(2)

In [None]:
# Five most frequent conditions (value_counts sorts by frequency, descending).
df['condition'].value_counts().head(5)

In [None]:
# Keep only the reviews of the four conditions the classifier is trained on.
df_train = df[df['condition'].isin(
    ['Birth Control', 'Depression', 'High Blood Pressure', 'Diabetes, Type 2']
)]
# Drop every column that is not needed for text classification.
X = df_train.drop(columns=['Unnamed: 0', 'drugName', 'rating', 'date', 'usefulCount'])
X.head(2)

In [None]:
# One sub-frame per condition, selected by exact match on the 'condition' column.
X_birth = X[X['condition'].eq('Birth Control')]
X_dep = X[X['condition'].eq('Depression')]
X_bp = X[X['condition'].eq('High Blood Pressure')]
X_diab = X[X['condition'].eq('Diabetes, Type 2')]

In [None]:
# Word cloud for the 'Birth Control' reviews, drawn by the project's plotting helper;
# the second argument is presumably used as the figure title — confirm in custom_libs.plotting.
plotting.plot_word_cloud(X_birth, 'Birth Control')

In [None]:
#plotting.plot_word_cloud(X_dep, 'Depression')

In [None]:
#plotting.plot_word_cloud(X_bp, 'High Blood Pressure')

In [None]:
#plotting.plot_word_cloud(X_diab, 'Diabetes Type 2')

In [None]:
# Clean the 'review' column with the project's preprocess_text1 function.
# The following cell reads X['review_clean'], so this call presumably adds that
# column to X in place — TODO confirm in custom_libs.preprocessing.
# NOTE(review): 'proprocessing_function' is a misspelling of 'preprocessing_function';
# kept unchanged because it is a notebook-level name.
proprocessing_function = preprocessing.preprocess_text1
preprocessing.preprocess_dataframe(X, 'review', proprocessing_function)
X.head(2)

In [None]:
# Model input is the cleaned review text; the label to predict is the condition.
X_feat, y = X['review_clean'], X['condition']

In [None]:
# The four target conditions, listed in alphabetical order.
class_names = [
    'Birth Control',
    'Depression',
    'Diabetes, Type 2',
    'High Blood Pressure',
]

In [None]:
# Hold out 20% of the reviews for testing. Stratifying on the label keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_feat,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

### Count sui due classificatori

In [None]:
# Bag-of-words counts with English stop words removed. The vocabulary is learned
# from the training split only and then applied unchanged to the test split.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
# Evaluate Multinomial Naive Bayes on the count features. The estimator class itself
# (not an instance) is passed to the project helper — presumably it instantiates, fits
# and reports per-class results; see custom_libs.classification to confirm.
classification.test_classifier(MultinomialNB, count_train, count_test, y_train, y_test, class_names)

In [None]:
# Same evaluation on the count features, this time with the Passive-Aggressive classifier.
classification.test_classifier(PassiveAggressiveClassifier, count_train, count_test, y_train, y_test, class_names)

### TFIDF sui due classificatori

In [None]:
# TF-IDF features on single words: English stop words removed and terms occurring in
# more than 80% of the training documents ignored (max_df=0.8).
# Fitted on the training split only, then applied to the test split.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Evaluate Multinomial Naive Bayes on the unigram TF-IDF features.
classification.test_classifier(MultinomialNB, tfidf_train, tfidf_test, y_train, y_test, class_names)


In [None]:
# Evaluate the Passive-Aggressive classifier on the unigram TF-IDF features.
classification.test_classifier(PassiveAggressiveClassifier, tfidf_train, tfidf_test, y_train, y_test, class_names)


### TFIDF: Bigrams solo su Passive Aggressive Classifier

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1,2)); same stop-word and
# max_df settings as the unigram vectorizer. Fitted on the training split only.
tfidf_vectorizer2 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
tfidf_train_2 = tfidf_vectorizer2.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer2.transform(X_test)

In [None]:
# Evaluate the Passive-Aggressive classifier on the unigram+bigram TF-IDF features.
classification.test_classifier(PassiveAggressiveClassifier, tfidf_train_2, tfidf_test_2, y_train, y_test, class_names)

### TFIDF: Trigrams solo su Passive Aggressive Classifier

In [None]:
# TF-IDF features over unigrams, bigrams and trigrams (ngram_range=(1,3)); same
# stop-word and max_df settings as before. Fitted on the training split only.
tfidf_vectorizer3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))
tfidf_train_3 = tfidf_vectorizer3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer3.transform(X_test)

# Evaluate the Passive-Aggressive classifier on the 1-3 gram TF-IDF features.
classification.test_classifier(PassiveAggressiveClassifier, tfidf_train_3, tfidf_test_3, y_train, y_test, class_names)

## Most important Features

In [None]:
# Final model: Passive-Aggressive classifier trained on the 1-3 gram TF-IDF features.
# A fixed random_state is set because the estimator shuffles the training data by
# default, so without a seed the fitted coefficients change from run to run.
pass_tf = PassiveAggressiveClassifier(random_state=0)
pass_tf.fit(tfidf_train_3, y_train)

In [None]:
# pass_tf was fitted on tfidf_train_3, so its coefficients line up with the vocabulary
# of tfidf_vectorizer3 (not the bigram vectorizer): use that one to name the features.
classification.most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Birth Control')

In [None]:
# pass_tf was fitted on tfidf_train_3, so its coefficients line up with the vocabulary
# of tfidf_vectorizer3 (not the bigram vectorizer): use that one to name the features.
classification.most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Depression')

In [None]:
# pass_tf was fitted on tfidf_train_3, so its coefficients line up with the vocabulary
# of tfidf_vectorizer3 (not the bigram vectorizer): use that one to name the features.
classification.most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'High Blood Pressure')

In [None]:
# pass_tf was fitted on tfidf_train_3, so its coefficients line up with the vocabulary
# of tfidf_vectorizer3 (not the bigram vectorizer): use that one to name the features.
classification.most_informative_feature_for_class(tfidf_vectorizer3, pass_tf, 'Diabetes, Type 2')

## Sample Predictions

In [None]:
def top_drugs_extractor(condition, n=3, min_rating=9, min_useful_count=100, data=None):
    """Return the names of the best-reviewed drugs for a condition.

    Reviews are kept when they concern ``condition``, have a rating of at least
    ``min_rating`` and a useful-count of at least ``min_useful_count``. They are
    ranked by rating, then by useful-count (both descending), and the first ``n``
    *distinct* drug names are returned.

    Parameters
    ----------
    condition : str
        Value to match exactly against the 'condition' column.
    n : int, default 3
        Maximum number of drug names to return.
    min_rating : number, default 9
        Minimum value of the 'rating' column for a review to be considered.
    min_useful_count : number, default 100
        Minimum value of the 'usefulCount' column for a review to be considered.
    data : pandas.DataFrame, optional
        Frame with 'condition', 'rating', 'usefulCount' and 'drugName' columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list
        Up to ``n`` distinct drug names, best first. May be shorter than ``n``
        (or empty) when too few reviews pass the thresholds.
    """
    if data is None:
        data = df  # notebook-level reviews frame

    eligible = data[
        (data['condition'] == condition)
        & (data['rating'] >= min_rating)
        & (data['usefulCount'] >= min_useful_count)
    ]
    ranked = eligible.sort_values(by=['rating', 'usefulCount'], ascending=[False, False])
    # Several top reviews can be about the same drug; keep each name once
    # (first, i.e. best-ranked, occurrence) so the result lists distinct drugs.
    return ranked['drugName'].drop_duplicates().head(n).tolist()

In [None]:
# Sample review texts used by the prediction cells that follow. The text is raw,
# i.e. not preprocessed (the second entry still contains the HTML entity &#039;).
sentences = [
    "I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.",
    "This is the third med I&#039;ve tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap.",
    "I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me",
]

In [None]:
# The classifier was trained on preprocessed text (X['review_clean']), so the sample
# sentences go through the same preprocessing before being vectorised.
# NOTE(review): assumes preprocess_text1 maps one raw string to one cleaned string,
# as its use with preprocess_dataframe / predict_sentences suggests — confirm.
clean_sentences = [preprocessing.preprocess_text1(raw_text) for raw_text in sentences]
tfidf_trigram = tfidf_vectorizer3.transform(clean_sentences)
predictions = pass_tf.predict(tfidf_trigram)

# Same report for every predicted condition: the original (raw) text, the predicted
# condition and the best-reviewed drugs for it.
for text, label in zip(sentences, predictions):
    top_drugs = top_drugs_extractor(label)
    print("text:", text, "\nCondition:", label)
    print("Top 3 Suggested Drugs:")
    # Slice instead of indexing [0], [1], [2]: fewer than three drugs may qualify.
    for drug in top_drugs[:3]:
        print(drug)
    print()

In [None]:
# Run the same sample sentences through the project helper, which receives the
# trigram vectorizer, the fitted model and the preprocessing function.
# NOTE(review): the result is displayed as a table; its exact columns are defined
# in custom_libs.classification.
df_testsent = classification.predict_sentences(sentences, tfidf_vectorizer3, pass_tf, preprocessing.preprocess_text1)
df_testsent

In [None]:
# Persist the fitted model and its vectorizer through the project's dump helper.
# NOTE: the artifact is named 'countvectorizer' but the object saved is the 1-3 gram
# TF-IDF vectorizer; the name is kept because the loading cell uses the same one.
dump.save_model(pass_tf, 'passmodel')
dump.save_vectorizer(tfidf_vectorizer3, 'countvectorizer')

In [None]:
# Round-trip check: reload the persisted vectorizer and model, then classify one review.
vectorizer = dump.load_vectorizer('countvectorizer')
model = dump.load_model('passmodel')

sample_review = "I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations"
# The model was trained on preprocessed text, so apply the same preprocessing here.
# NOTE(review): assumes preprocess_text1 maps one raw string to one cleaned string — confirm.
test = model.predict(vectorizer.transform([preprocessing.preprocess_text1(sample_review)]))
test[0]