In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import re
import string
from string import punctuation
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [None]:
# Importing the datasets

df_train = pd.read_csv("/kaggle/input/drug-dataset/drugsComTrain_raw.csv")
df_test = pd.read_csv("/kaggle/input/drug-dataset/drugsComTest_raw.csv") 

print ("The shape of the train set given is : ", df_train.shape)
print ("The shape of the test set given is : ", df_test.shape)

df_train.head()

In [None]:
df_train.isnull().sum()

In [None]:
df_train[df_train['condition'].isnull()]

In [None]:
df_train.dropna(subset=['condition'], inplace=True)

In [None]:
# Calculating what percentage of data is null
size = df_train.shape[0]

print ("Total Size of the dataset : ", size)

total_na = df_train.isnull().sum(axis = 0)['condition']
print ("Null values : ", total_na)

print ("PERCENTAGE : ", (total_na/size)*100)

In [None]:
# Dropping the data points with null values as it's very much less than 5% of the whole dataset
df_data = df_train.dropna(how = 'any', axis = 0)

print ("The shape of the dataset after null values removal :", df_train.shape)

In [None]:
df_train = df_data

In [None]:
df_data.sort_values(['uniqueID'], ascending = True, inplace = True)
df_data.reset_index(drop = True, inplace = True)
df_data.head(10)

In [None]:
# Converting the date in to date time format 
df_data['date'] = pd.to_datetime(df_data['date'])

In [None]:
df_data['rating'].unique().tolist()

In [None]:
# Giving the Sentiment according to the ratings
df_data['sentiment_rate'] = df_data['rating'].apply(lambda x: 1 if x > 5 else 0)

In [38]:
df_data['sentiment_rate'].value_counts()

sentiment_rate
1    112611
0     47787
Name: count, dtype: int64

In [39]:
df_data_1_sampled = df_data[df_data['sentiment_rate'] == 1].sample(n=47787, random_state=42)
df_data_0 = df_data[df_data['sentiment_rate'] == 0]
df_balanced = pd.concat([df_data_1_sampled, df_data_0])

In [40]:
df_balanced['sentiment_rate'].value_counts()

sentiment_rate
1    47787
0    47787
Name: count, dtype: int64

In [41]:
df_balanced.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sentiment_rate
108930,157448,Guaifenesin,Cough,"""Actually I was using Mucinex to control a nag...",10,2013-05-03,36,1
113550,164085,Ibuprofen,Period Pain,"""I suffer with endometriosis. Due to this the ...",9,2014-08-17,16,1
65273,94170,Trintellix,Depression,"""I&#039;ve been on this since last June, at 5 ...",10,2015-10-31,70,1
79763,115247,Diazepam,Anxiety,"""It helps keep things peaceful.""",10,2009-05-18,20,1
106256,153490,Doxycycline,Acne,"""I&#039;m 20 years old now and I started this ...",7,2013-12-18,7,1


In [42]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [43]:
def review_clean(review): 
    # changing to lower case
    lower = review.str.lower()
    
    # Replacing the repeating pattern of &#039;
    pattern_remove = lower.str.replace("&#039;", "")
    
    # Removing all the special Characters
    special_remove = pattern_remove.str.replace(r'[^\w\d\s]',' ')
    
    # Removing all the non ASCII characters
    ascii_remove = special_remove.str.replace(r'[^\x00-\x7F]+',' ')
    
    # Removing the leading and trailing Whitespaces
    whitespace_remove = ascii_remove.str.replace(r'^\s+|\s+?$','')
    
    # Replacing multiple Spaces with Single Space
    multiw_remove = whitespace_remove.str.replace(r'\s+',' ')
    
    # Replacing Two or more dots with one
    dataframe = multiw_remove.str.replace(r'\.{2,}', ' ')
    
    return dataframe

In [44]:
# Contraction Dictionary for the expansion

contractions_dict = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not",
    "doesn’t": "does not", "don't": "do not", "don’t": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not", "he'd": "he had", "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not","might've": "might have",
    "mightn't": "might not", "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so is", "that'd": "that would", "that'd've": "that would have",
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "they'd": "they would",
    "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have",
    "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y’all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", "ain’t": "am not", "aren’t": "are not",
    "can’t": "cannot", "can’t’ve": "cannot have", "’cause": "because", "could’ve": "could have", "couldn’t": "could not", "couldn’t’ve": "could not have",
    "didn’t": "did not", "doesn’t": "does not", "don’t": "do not", "don’t": "do not", "hadn’t": "had not", "hadn’t’ve": "had not have",
    "hasn’t": "has not", "haven’t": "have not", "he’d": "he had", "he’d’ve": "he would have", "he’ll": "he will", "he’ll’ve": "he will have",
    "he’s": "he is", "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is", "i’d": "i would", "i’d’ve": "i would have",
    "i’ll": "i will", "i’ll’ve": "i will have", "i’m": "i am", "i’ve": "i have", "isn’t": "is not", "it’d": "it would", "it’d’ve": "it would have",
    "it’ll": "it will", "it’ll’ve": "it will have", "it’s": "it is", "let’s": "let us", "ma’am": "madam", "mayn’t": "may not",
    "might’ve": "might have", "mightn’t": "might not", "mightn’t’ve": "might not have", "must’ve": "must have", "mustn’t": "must not",
    "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have", "o’clock": "of the clock",
    "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have",
    "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will", "she’ll’ve": "she will have", "she’s": "she is",
    "should’ve": "should have", "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have", "so’s": "so is",
    "that’d": "that would", "that’d’ve": "that would have", "that’s": "that is", "there’d": "there would", "there’d’ve": "there would have",
    "there’s": "there is", "they’d": "they would", "they’d’ve": "they would have", "they’ll": "they will", "they’ll’ve": "they will have",
    "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have",
    "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are", "we’ve": "we have", "weren’t": "were not", "what’ll": "what will",
    "what’ll’ve": "what will have", "what’re": "what are", "what’s": "what is", "what’ve": "what have", "when’s": "when is",
    "when’ve": "when have", "where’d": "where did", "where’s": "where is", "where’ve": "where have", "who’ll": "who will",
    "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have","why’s": "why is", "why’ve": "why have", "will’ve": "will have",
    "won’t": "will not", "won’t’ve": "will not have", "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have",
    "y’all": "you all", "y’all": "you all", "y’all’d": "you all would", "y’all’d’ve": "you all would have", "y’all’re": "you all are",
    "y’all’ve": "you all have", "you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have",
    "you’re": "you are", "you’re": "you are", "you’ve": "you have"
}
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function expand the contractions if there's any
def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)

In [45]:
#df_data['review_clean'] = df_data['review'].apply(review_clean)
df_data['review_clean'] = review_clean(df_data['review'])

# Expanding the contractions
df_data['review_clean'] = df_data['review_clean'].apply(lambda x: expand_contractions(x))

# Removing punctuations
df_data['review_clean'] = df_data['review_clean'].apply(lambda x: ''.join(word for word in x if word not in punctuation))

In [46]:
df_balanced.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sentiment_rate
108930,157448,Guaifenesin,Cough,"""Actually I was using Mucinex to control a nag...",10,2013-05-03,36,1
113550,164085,Ibuprofen,Period Pain,"""I suffer with endometriosis. Due to this the ...",9,2014-08-17,16,1
65273,94170,Trintellix,Depression,"""I&#039;ve been on this since last June, at 5 ...",10,2015-10-31,70,1
79763,115247,Diazepam,Anxiety,"""It helps keep things peaceful.""",10,2009-05-18,20,1
106256,153490,Doxycycline,Acne,"""I&#039;m 20 years old now and I started this ...",7,2013-12-18,7,1


In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import re
import string
from string import punctuation
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

In [50]:
# Removing the stopwords
stop_words = set(stopwords.words('english'))
punctuation = punctuation + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
df_data['review_clean'] = df_data['review_clean'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [51]:
# Removing the word stems using the Snowball Stemmer
Snow_ball = SnowballStemmer("english")
df_data['review_clean'] = df_data['review_clean'].apply(lambda x: " ".join(Snow_ball.stem(word) for word in x.split()))

In [52]:
df_data['review_clean'].head(20)

0     im 21 year old recent found might pcos havent ...
1     shot 11 year month ago never 1 period even spo...
2     ive four shot point birth control pill year du...
3     total 3 shot got first one leav hospit give bi...
4     im 18 got heavi bleed ive alway heard bc make ...
5     im 19 heavi pain period sinc forev got depo sh...
6     im 30 year old woman got shot august 9th bleed...
7     im 17 year old got shot august 2015 person don...
8     first month awesom absolut wonder start light ...
9     start depo shot year ago origin bled 3 week da...
10    im 17 went depo heavi bleed like heavi would g...
11    1st shot sept 2nd nonstop bleed sinc side effe...
12    got shot 6 week post partum nurs seen high rec...
13    one inject june 2012 see gp want tri babi soon...
14    depo provera age 1518 pros pregnant period con...
15    got shot 2 half month ago stop bleed spot firs...
16    got depo shot later part septemb octob didnt p...
17    hyperplasia last year well endometriosi su

In [53]:
# Separating the day, month and year from the Date

df_data['day'] = df_data['date'].dt.day
df_data['month'] = df_data['date'].dt.month
df_data['year'] = df_data['date'].dt.year

In [54]:
def sentiment(review):
    # Sentiment polarity of the reviews
    pol = []
    for i in review:
        analysis = TextBlob(i)
        pol.append(analysis.sentiment.polarity)
    return pol

In [55]:
df_data['sentiment'] = sentiment(df_data['review'])

In [56]:
df_data['sentiment_clean'] = sentiment(df_data['review_clean'])

In [57]:
np.corrcoef(df_data['sentiment'], df_data['rating'])

array([[1.        , 0.34870057],
       [0.34870057, 1.        ]])

In [58]:
np.corrcoef(df_data['sentiment_clean'], df_data['rating'])

array([[1.        , 0.23278758],
       [0.23278758, 1.        ]])

In [59]:
# Cleaning the reviews without removing the stop words and using snowball stemmer

df_data['review_clean_ss'] = review_clean(df_data['review'])

df_data['review_clean_ss'] = df_data['review_clean_ss'].apply(lambda x: expand_contractions(x))

df_data['review_clean_ss'] = df_data['review_clean_ss'].apply(lambda x: ''.join(word for word in x if word not in punctuation))

df_data['sentiment_clean_ss'] = sentiment(df_data['review_clean_ss'])

In [60]:
np.corrcoef(df_data['sentiment_clean_ss'], df_data['rating'])

array([[1.        , 0.34540542],
       [0.34540542, 1.        ]])

In [61]:
# Label Encoding Drugname and Conditions
label_encoder_feat = {}
for feature in ['drugName', 'condition']:
    label_encoder_feat[feature] = LabelEncoder()
    df_data[feature] = label_encoder_feat[feature].fit_transform(df_data[feature])

#### **Using GloVe**

In [62]:
import numpy as np

def load_glove_embeddings(path):
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_path)


In [63]:
texts = df_train['review'].tolist()  # Convert DataFrame column to list
labels = df_train['sentiment_rate'].values  # Get labels as an array

#### **Tokenize Data**

In [64]:
# Assuming 'texts' is a list of strings from your dataset
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=100)

# Assuming you have a binary label for each text
labels = np.array(labels)

Found 51382 unique tokens.


In [65]:
embedding_dim = 100
maxlen = 100  # Same as 'maxlen' in pad_sequences

# Preparing the GloVe embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

#### **Prepare the Model**

In [66]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [67]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

In [68]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

#### **Training Process**

In [80]:
history = model.fit(data_train, labels_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 7ms/step - accuracy: 0.7512 - loss: 0.5109 - val_accuracy: 0.8151 - val_loss: 0.3987
Epoch 2/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.8354 - loss: 0.3677 - val_accuracy: 0.8526 - val_loss: 0.3349
Epoch 3/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - accuracy: 0.8604 - loss: 0.3165 - val_accuracy: 0.8623 - val_loss: 0.3168
Epoch 4/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - accuracy: 0.8819 - loss: 0.2788 - val_accuracy: 0.8669 - val_loss: 0.3068
Epoch 5/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - accuracy: 0.8975 - loss: 0.2465 - val_accuracy: 0.8739 - val_loss: 0.3033
Epoch 6/20
[1m3208/3208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.9149 - loss: 0.2107 - val_accuracy: 0.8807 - val_loss: 0.3037
Epoch 7/20

In [81]:
predictions_prob = model.predict(data_test)
predictions = (predictions_prob > 0.5).astype(int)

[1m1003/1003[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


#### **Classification Report**

In [82]:
report = classification_report(labels_test, predictions, target_names=['Negative', 'Positive'])
print(report)

              precision    recall  f1-score   support

    Negative       0.81      0.82      0.81      9519
    Positive       0.92      0.92      0.92     22561

    accuracy                           0.89     32080
   macro avg       0.87      0.87      0.87     32080
weighted avg       0.89      0.89      0.89     32080



#### **Accuracy**

In [83]:
final_train_accuracy = history.history['accuracy'][-1]  # Last epoch training accuracy
final_val_accuracy = history.history['val_accuracy'][-1]  # Last epoch validation accuracy

print(f'Final Training Accuracy: {final_train_accuracy:.2f}')
print(f'Final Validation Accuracy: {final_val_accuracy:.2f}')

Final Training Accuracy: 0.99
Final Validation Accuracy: 0.89


In [84]:
test_accuracy = accuracy_score(labels_test, predictions)
print(f'Test Accuracy: {test_accuracy:.5f}')

Test Accuracy: 0.88900


#### **Drugs Recommendation**

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

# Contoh penggunaan TfidfVectorizer yang menghasilkan sparse matrix
data = {'condition': ['Diabetes', 'Diabetes', 'Heart Attack', 'Cold', 'Flu'],
        'description': ['High sugar levels.', 'Insulin resistance.', 'Heart pain and discomfort.', 'Running nose and sneezing.', 'High fever and chills.']}

df = pd.DataFrame(data)

# Menggunakan TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['description'])

# Hitung cosine similarity
cosine_sim = linear_kernel(X, X)

# Implementasi function rekomendasi
def recommend_condition(index, cosine_sim, top_n=2):
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i[0] for i in sim_scores[1: top_n+1]]  # skip the self similarity
    return df['condition'].iloc[top_indices]

# Contoh pemanggilan fungsi
recommend_condition(0, cosine_sim)

4         Flu
1    Diabetes
Name: condition, dtype: object