In [166]:
import pandas as pd

# Load the CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,758,759,760,761,762,763,764,765,766,767
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [167]:
# I am going to drop the columns that are not embeddings of the review and the condition_cluster

columns_to_drop = [ 'uniqueID','date', 'drugName','review','lengthReview', 'conditionCluster_label']

embedded_df.drop(columns=columns_to_drop, inplace=True)

In [168]:
embedded_df.head()

Unnamed: 0,condition,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,ADHD,8.0,192.0,4.0,-0.010977,0.010914,0.200967,-0.22949,-0.535286,0.012419,...,-0.416409,-0.36404,-0.03606,0.383963,0.176255,-0.147201,-0.243359,-0.541467,0.06216,0.049585
1,Birth Control,5.0,17.0,6.0,0.06632,0.189584,0.369006,-0.04692,-0.473988,-0.238288,...,-0.279307,-0.419729,-0.389261,0.328398,0.291834,-0.027217,-0.35979,-0.706709,0.047264,-0.017902
2,Birth Control,8.0,10.0,4.0,0.084101,-0.019134,0.294494,0.029783,-0.228783,0.170102,...,-0.232733,-0.031823,-0.032784,0.18844,0.162272,0.363399,-0.09665,-0.693634,-0.024901,0.548486
3,Opiate Dependence,9.0,37.0,2.0,0.00782,0.207558,0.179105,-0.210057,-0.197015,0.104799,...,-0.310564,-0.599643,-0.375174,0.309915,0.577983,0.051811,-0.184821,-0.710691,0.065533,0.371945
4,Benign Prostatic Hyperplasia,2.0,43.0,5.0,-0.193177,0.360585,0.448292,-0.253824,-0.532782,0.085381,...,-0.247983,-0.438636,-0.037911,-0.030183,0.50878,0.064493,-0.205261,-0.527391,-0.101341,0.039573


In [169]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [170]:
embedded_df.head()

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,-0.010977,0.010914,0.200967,-0.22949,-0.535286,0.012419,0.57862,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,0.06632,0.189584,0.369006,-0.04692,-0.473988,-0.238288,0.341089,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,0.084101,-0.019134,0.294494,0.029783,-0.228783,0.170102,0.185404,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,0.00782,0.207558,0.179105,-0.210057,-0.197015,0.104799,0.338058,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-0.193177,0.360585,0.448292,-0.253824,-0.532782,0.085381,0.607802,...,0,0,0,0,0,0,0,0,0,0


In [171]:
# When I start training the model I am getting an error messsage that NAN values are present
embedded_df = embedded_df.dropna(subset=['drugNameCluster_label'])

In [172]:
#data split step
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)


[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [173]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [174]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

1190


In [175]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# Most of this is going to be the same as the HW assignment, but there are 10 classifications to predict now
# So the output layer has been adjusted

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=num_dimensions, activation="relu", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=600, activation='leaky_relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fifth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.001)))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 10 units for 10 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 1190)              1417290   
                                                                 
 dense_51 (Dense)            (None, 600)               714600    
                                                                 
 dense_52 (Dense)            (None, 512)               307712    
                                                                 
 dense_53 (Dense)            (None, 256)               131328    
                                                                 
 dense_54 (Dense)            (None, 256)               65792     
                                                                 
 dense_55 (Dense)            (None, 128)               32896     
                                                                 
 dense_56 (Dense)            (None, 64)               

In [177]:
# Compile the model - the loss function is categorical and not for binary classification

nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [178]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [179]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 2.5134689807891846
Test Accuracy: 0.4088365137577057


In [None]:
!pip install keras-tuner
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

# Define the model-building function
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(units=hp.Int('units_input', min_value=64, max_value=1024, step=64), activation='relu', input_dim=num_dimensions))
    for i in range(hp.Int('num_layers', 1, 8)):  # Number of hidden layers
        model.add(layers.Dense(units=hp.Int('units_' + str(i), min_value=64, max_value=512, step=64), activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))
    model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model

# Instantiate the tuner and perform hyperparameter search
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='helloworld')

tuner.search(X_train_scaled, y_train_encoded, epochs=5, validation_split=0.2)

# Get the best hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Optimal number of units in the input layer: {best_hps.get('units_input')}")
print(f"Optimal learning rate for the optimizer: {best_hps.get('learning_rate')}")
print(f"Optimal number of hidden layers: {best_hps.get('num_layers')}")
for i in range(best_hps.get('num_layers')):
    print(f"Optimal number of units in layer {i}: {best_hps.get('units_' + str(i))}")

# Build the model with the best hyperparameters and train it on the data
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train_scaled, y_train_encoded, epochs=50, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_scaled, y_test_encoded)
print("Test accuracy:", accuracy)

Reloading Tuner from my_dir/helloworld/tuner0.json
Optimal number of units in the input layer: 64
Optimal learning rate for the optimizer: 0.001
Optimal number of hidden layers: 1
Optimal number of units in layer 0: 128
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.29602357745170593


In [None]:
!pip install scikeras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.optimizers import Adam, RMSprop
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras import regularizers
import tensorflow as tf


def create_model2(units=1190, activation='sigmoid', kernel_initializer='glorot_uniform', optimizer='adam', learning_rate=0.001, loss='binary_crossentropy'):
    model = Sequential()
    model.add(InputLayer(input_shape=(1190,)))  # Assuming 771 input features, adjust accordingly
    model.add(Dense(units=600, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(Dense(units=10, activation='softmax'))


   # Compile the model - the loss function is categorical and not for binary classification
    custom_optimizer = Adam(learning_rate=0.001)
    model.compile(loss="categorical_crossentropy", optimizer=custom_optimizer, metrics=["accuracy"])

    #model.compile(optimizer=adam', loss=loss, metrics=['binary_accuracy'])
    return model

# Define hyperparameters to search
hyperparameters = {
    'units': [32, 64, 128, 256, 512 ],
    'optimizer': ['adam',],
    'learning_rate': [0.001, 0.01, 0.1],

}

# Construct the KerasClassifier with the create_model function and other necessary parameters
model = KerasClassifier(build_fn=create_model2, verbose=0, learning_rate=0.001, units=1190)




In [None]:

# Create GridSearchCV instance
grid = GridSearchCV(estimator=model, param_grid=hyperparameters, scoring='accuracy', error_score='raise')

In [None]:
# Fit the grid search
grid_result = grid.fit(X_train_scaled, y_train_encoded, epochs=10)

  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y =

In [None]:
best_model = grid_result.best_estimator_
best_model

In [None]:
test_accuracy = best_model.score(X_test_scaled, y_test_encoded)
print('Test accuracy:', test_accuracy)

Test accuracy: 0.40706921944035346


In [None]:
Trying with reduced csv file

In [36]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,335,336,337,338,339,340,341,342,343,344
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.453633,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.123561,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.43079,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.429121,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.211574,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147


In [37]:
# Droping  the non-beneficial and non-embedded columns

columns_to_drop = [ 'uniqueID','date', 'drugName','review','lengthReview', 'conditionCluster_label']

embedded_df.drop(columns=columns_to_drop, inplace=True)

In [39]:
embedded_df.head()

Unnamed: 0,condition,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,...,335,336,337,338,339,340,341,342,343,344
0,ADHD,8.0,192.0,4.0,6.256752,0.08955,-15.717361,7.26135,0.713106,-1.665947,...,-0.453633,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132
1,Birth Control,5.0,17.0,6.0,-3.616027,-1.046721,-3.250082,-6.424401,1.245648,0.238435,...,-0.123561,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668
2,Birth Control,8.0,10.0,4.0,-5.94492,-0.564254,-3.821374,-2.283092,-7.821428,-1.08098,...,0.43079,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445
3,Opiate Dependence,9.0,37.0,2.0,1.624358,0.118632,-2.822628,-9.034972,-0.346689,1.570179,...,-0.429121,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597
4,Benign Prostatic Hyperplasia,2.0,43.0,5.0,-4.455269,-1.187546,3.13366,-4.596779,-2.269232,-5.121815,...,-0.211574,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147


In [40]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_df = pd.concat([
    embedded_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [41]:
embedded_df.head()


Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,6.256752,0.08955,-15.717361,7.26135,0.713106,-1.665947,-5.037073,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-3.616027,-1.046721,-3.250082,-6.424401,1.245648,0.238435,1.832553,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,-5.94492,-0.564254,-3.821374,-2.283092,-7.821428,-1.08098,1.283532,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,1.624358,0.118632,-2.822628,-9.034972,-0.346689,1.570179,0.553306,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-4.455269,-1.187546,3.13366,-4.596779,-2.269232,-5.121815,-2.388084,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# When I start training the model I am getting an error messsage that NAN values are present
embedded_df = embedded_df.dropna(subset=['drugNameCluster_label'])

In [43]:
#data split step
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

from keras.utils import to_categorical

# Separate the target variable
target = embedded_df['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_df.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [44]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_df.shape[1]-1
print(num_dimensions)

767


In [46]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# Most of this is going to be the same as the HW assignment, but there are 10 classifications to predict now
# So the output layer has been adjusted

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=num_dimensions, activation="relu", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=600, activation='leaky_relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=512, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Fifth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.001)))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 10 units for 10 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 767)               589056    
                                                                 
 dense_10 (Dense)            (None, 600)               460800    
                                                                 
 dense_11 (Dense)            (None, 512)               307712    
                                                                 
 dense_12 (Dense)            (None, 256)               131328    
                                                                 
 dense_13 (Dense)            (None, 256)               65792     
                                                                 
 dense_14 (Dense)            (None, 128)               32896     
                                                                 
 dense_15 (Dense)            (None, 64)               

In [47]:
# Compile the model - the loss function is categorical and not for binary classification

nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [52]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [53]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 3.980900526046753
Test Accuracy: 0.4108983874320984


In [None]:
# Trying to predict DrugName using sentiment analysis, review embedding and condition embedding

In [180]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,335,336,337,338,339,340,341,342,343,344
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.453633,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.123561,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.43079,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.429121,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.211574,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147


In [181]:
# Read the review_sentiment CSV
sentiment_df = pd.read_csv('reviews_sentiments.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,POSITIVE
1,92703,I used to take another oral contraceptive whic...,NEGATIVE
2,138000,This is my first time using any form of birth ...,NEGATIVE
3,35696,Suboxone has completely turned my life around ...,POSITIVE
4,155963,2nd day on 5mg started to work with rock hard ...,NEGATIVE


In [182]:
#update the sentiment labels

sentiment_df['sentiment'].replace('POSITIVE', 0, inplace =True)

sentiment_df['sentiment'].replace('NEGATIVE', 1, inplace =True)

sentiment_df['sentiment'].replace('NEUTRAL', 2, inplace =True)

sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,0
1,92703,I used to take another oral contraceptive whic...,1
2,138000,This is my first time using any form of birth ...,1
3,35696,Suboxone has completely turned my life around ...,0
4,155963,2nd day on 5mg started to work with rock hard ...,1


In [183]:
# Records for each sentiments.POstive and negative labels are almost balanced, but very few records with neutral label.
sentiment_df['sentiment'].value_counts()

0    6294
1    4953
2      68
Name: sentiment, dtype: int64

In [184]:
# drop the review column from sentiment_df
sentiment_df.drop('review', axis=1, inplace=True)

In [185]:
sentiment_df.head()

Unnamed: 0,uniqueID,sentiment
0,95260,0
1,92703,1
2,138000,1
3,35696,0
4,155963,1


In [186]:
embedded_sentiment_df = pd.merge(embedded_df, sentiment_df, on='uniqueID')

embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,336,337,338,339,340,341,342,343,344,sentiment
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132,0
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668,1
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445,1
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597,0
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147,1


In [187]:
# Total number of records in the dataframe
len(embedded_sentiment_df)

11315

In [188]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [155]:
embedded_sentiment_df.columns

Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'lengthReview', 'conditionCluster_label',
       'drugNameCluster_label',
       ...
       '336', '337', '338', '339', '340', '341', '342', '343', '344',
       'sentiment'],
      dtype='object', length=356)

In [189]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_sentiment_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [190]:
embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,0,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,95260,Guanfacine,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,6.256752,...,0,0,0,0,0,0,0,0,0,0
1,92703,Lybrel,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,-3.616027,...,0,0,0,0,0,0,0,0,0,0
2,138000,Ortho Evra,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,-5.94492,...,0,0,0,0,0,0,0,0,0,0
3,35696,Buprenorphine naloxone,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,1.624358,...,0,0,0,0,0,0,0,0,0,0
4,155963,Cialis,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,-4.455269,...,0,0,0,0,0,0,0,0,0,0


In [191]:
# Drop all the columns that are not embeddings of the review and the sentiment labels.

columns_to_drop = ['uniqueID', 'drugName', 'review', 'date', 'lengthReview', 'conditionCluster_label']

embedded_sentiment_df_new = embedded_sentiment_df.drop(columns=columns_to_drop)
embedded_sentiment_df_new.head(5)

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,mance Anxiety,min,min saxagliptin,min sitagliptin,mis,moterol,moterol mometasone,t Pac with Cyclobenzaprine cyclobenzaprine,tic mycophenolic acid,zen Shoulde
0,8.0,192.0,4.0,6.256752,0.08955,-15.717361,7.26135,0.713106,-1.665947,-5.037073,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-3.616027,-1.046721,-3.250082,-6.424401,1.245648,0.238435,1.832553,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,-5.94492,-0.564254,-3.821374,-2.283092,-7.821428,-1.08098,1.283532,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,1.624358,0.118632,-2.822628,-9.034972,-0.346689,1.570179,0.553306,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-4.455269,-1.187546,3.13366,-4.596779,-2.269232,-5.121815,-2.388084,...,0,0,0,0,0,0,0,0,0,0


In [192]:
# Split our preprocessed data into our features and target arrays

from keras.utils import to_categorical

# Separate the target variable
target = embedded_sentiment_df_new['drugNameCluster_label']

# Verify the unique values in the target variable
print(target.unique())

# Extract features (excluding the target variable)
features = embedded_sentiment_df_new.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [193]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [194]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_sentiment_df_new.shape[1]-1
print(num_dimensions)

768


In [195]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# 3 classifications to predict now


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=768, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=512, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=256, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=256, activation='relu'))

#Fifth hidden layer (example of adding another hidden layer)
# nn.add(Dense(units=100, activation='relu'))

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=25, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 3 units for 3 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_59 (Dense)            (None, 768)               590592    
                                                                 
 dense_60 (Dense)            (None, 512)               393728    
                                                                 
 dense_61 (Dense)            (None, 256)               131328    
                                                                 
 dense_62 (Dense)            (None, 256)               65792     
                                                                 
 dense_63 (Dense)            (None, 50)                12850     
                                                                 
 dense_64 (Dense)            (None, 25)                1275      
                                                                 
 dense_65 (Dense)            (None, 15)               

In [196]:
#compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [197]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [198]:
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 1.7029719352722168
Test Accuracy: 0.43652430176734924


In [None]:
# Predicting  DrugNAme Cluster_label (target) using review embedding, sentiment analysis, rating, usefulCount, condition dummy variables and drugNAme dummy variables (fetaures)

In [199]:
import pandas as pd

# Load the review embeddings CSV file into a DataFrame
embedded_df = pd.read_csv('embedded_review_reduced.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
embedded_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,335,336,337,338,339,340,341,342,343,344
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,-0.453633,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,-0.123561,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,0.43079,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.429121,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.211574,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147


In [200]:
# Read the review_sentiment CSV
sentiment_df = pd.read_csv('reviews_sentiments.csv')

# Display the first few rows of the DataFrame to verify it was loaded correctly
sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,POSITIVE
1,92703,I used to take another oral contraceptive whic...,NEGATIVE
2,138000,This is my first time using any form of birth ...,NEGATIVE
3,35696,Suboxone has completely turned my life around ...,POSITIVE
4,155963,2nd day on 5mg started to work with rock hard ...,NEGATIVE


In [201]:
#update the sentiment labels

sentiment_df['sentiment'].replace('POSITIVE', 0, inplace =True)

sentiment_df['sentiment'].replace('NEGATIVE', 1, inplace =True)

sentiment_df['sentiment'].replace('NEUTRAL', 2, inplace =True)

sentiment_df.head()

Unnamed: 0,uniqueID,review,sentiment
0,95260,My son is halfway through his fourth week of I...,0
1,92703,I used to take another oral contraceptive whic...,1
2,138000,This is my first time using any form of birth ...,1
3,35696,Suboxone has completely turned my life around ...,0
4,155963,2nd day on 5mg started to work with rock hard ...,1


In [202]:
# drop the review column from sentiment_df
sentiment_df.drop('review', axis=1, inplace=True)
sentiment_df.head()

Unnamed: 0,uniqueID,sentiment
0,95260,0
1,92703,1
2,138000,1
3,35696,0
4,155963,1


In [203]:
# merge embedded_df and sentiment_df
embedded_sentiment_df = pd.merge(embedded_df, sentiment_df, on='uniqueID')

embedded_sentiment_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,...,336,337,338,339,340,341,342,343,344,sentiment
0,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,...,0.485813,-0.740961,0.156234,-0.318109,0.031853,0.171373,0.346318,-0.322281,0.315132,0
1,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,...,0.16135,-0.510392,-0.247828,0.680442,0.110699,0.259813,0.115464,0.135992,-0.302668,1
2,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,...,-0.6669,0.209315,-0.155577,0.017546,0.11231,-0.315534,0.069992,0.964049,0.108445,1
3,35696,Buprenorphine naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,...,-0.478984,0.170533,-0.227519,-0.690834,0.326514,0.49276,0.31636,0.86215,0.724597,0
4,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,...,-0.325883,0.51398,-0.116952,-0.819433,-0.577145,0.262017,-0.448657,-0.025434,-0.154147,1


In [204]:
# Total number of records in the dataframe
len(embedded_sentiment_df)

11315

In [205]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [206]:
#Review embedded_sentiment_df
embedded_sentiment_df.columns

Index(['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date',
       'usefulCount', 'lengthReview', 'conditionCluster_label',
       'drugNameCluster_label',
       ...
       '336', '337', '338', '339', '340', '341', '342', '343', '344',
       'sentiment'],
      dtype='object', length=356)

In [207]:
# Convert condition into multiple columns of dummy variables

dummies_sentiment = pd.get_dummies(embedded_sentiment_df['condition'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['condition']),
    dummies_drugs
], axis=1)

In [208]:
# Convert drugname into multiple columns of dummy variables

dummies_drugs = pd.get_dummies(embedded_sentiment_df['drugName'])

# Concatenate dummy variables with the original DataFrame
embedded_sentiment_df = pd.concat([
    embedded_sentiment_df.drop(columns=['drugName']),
    dummies_drugs
], axis=1)

In [209]:
#Review embedded_sentiment_df
embedded_sentiment_df.head()

Unnamed: 0,uniqueID,review,rating,date,usefulCount,lengthReview,conditionCluster_label,drugNameCluster_label,0,1,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,95260,My son is halfway through his fourth week of I...,8.0,27-Apr-10,192.0,712.0,2.0,4.0,6.256752,0.08955,...,0,0,0,0,0,0,0,0,0,0
1,92703,I used to take another oral contraceptive whic...,5.0,14-Dec-09,17.0,708.0,9.0,6.0,-3.616027,-1.046721,...,0,0,0,0,0,0,0,0,0,0
2,138000,This is my first time using any form of birth ...,8.0,3-Nov-15,10.0,428.0,9.0,4.0,-5.94492,-0.564254,...,0,0,0,0,0,0,0,0,0,0
3,35696,Suboxone has completely turned my life around ...,9.0,27-Nov-16,37.0,669.0,0.0,2.0,1.624358,0.118632,...,0,0,0,0,0,0,0,0,0,0
4,155963,2nd day on 5mg started to work with rock hard ...,2.0,28-Nov-15,43.0,373.0,0.0,5.0,-4.455269,-1.187546,...,0,0,0,0,0,0,0,0,0,0


In [210]:
# Drop all the non- beneficial columns

columns_to_drop = ['uniqueID', 'review', 'date', 'lengthReview', 'conditionCluster_label']

embedded_sentiment_df_new = embedded_sentiment_df.drop(columns=columns_to_drop)
embedded_sentiment_df_new.head(5)

Unnamed: 0,rating,usefulCount,drugNameCluster_label,0,1,2,3,4,5,6,...,Zovirax Ointment,Zyban,Zyclara,Zymine,Zyprexa,Zyprexa Zydis,Zyrtec,Zyvox,ella,femhrt
0,8.0,192.0,4.0,6.256752,0.08955,-15.717361,7.26135,0.713106,-1.665947,-5.037073,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17.0,6.0,-3.616027,-1.046721,-3.250082,-6.424401,1.245648,0.238435,1.832553,...,0,0,0,0,0,0,0,0,0,0
2,8.0,10.0,4.0,-5.94492,-0.564254,-3.821374,-2.283092,-7.821428,-1.08098,1.283532,...,0,0,0,0,0,0,0,0,0,0
3,9.0,37.0,2.0,1.624358,0.118632,-2.822628,-9.034972,-0.346689,1.570179,0.553306,...,0,0,0,0,0,0,0,0,0,0
4,2.0,43.0,5.0,-4.455269,-1.187546,3.13366,-4.596779,-2.269232,-5.121815,-2.388084,...,0,0,0,0,0,0,0,0,0,0


In [211]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Split our preprocessed data into our features and target arrays

from keras.utils import to_categorical

# Separate the target variable
target = embedded_sentiment_df_new['drugNameCluster_label']

# Extract features (excluding the target variable)
features = embedded_sentiment_df_new.drop(columns=['drugNameCluster_label'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=30)

# Convert target labels to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes=10)
y_test_encoded = to_categorical(y_test, num_classes=10)

# Verify the unique values in the target variable
print(target.unique())

# Check the shapes of the one-hot encoded target labels
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

[4. 6. 2. 5. 0. 1. 7. 3. 9. 8.]
Shape of y_train_encoded: (7920, 10)
Shape of y_test_encoded: (3395, 10)


In [212]:
# Review feature columns
features.columns

Index(['rating', 'usefulCount', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       'Zovirax Ointment', 'Zyban', 'Zyclara', 'Zymine', 'Zyprexa',
       'Zyprexa Zydis', 'Zyrtec', 'Zyvox', 'ella', 'femhrt'],
      dtype='object', length=2190)

In [213]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [214]:
# Check the number of dimensions, make it a variable so it passes into

num_dimensions = embedded_sentiment_df_new.shape[1]-1
print(num_dimensions)

2190


In [215]:
# Import our dependencies
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


nn = tf.keras.models.Sequential()

# Input layer
nn.add(tf.keras.layers.Dense(units=332, activation="sigmoid", input_dim=num_dimensions))

# Second hidden layer
nn.add(Dense(units=250, activation='relu'))

# Third hidden layer (example of adding an additional hidden layer)
nn.add(Dense(units=200, activation='relu'))

#Fourth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=100, activation='relu'))

#Fifth hidden layer (example of adding another hidden layer)

#Sixth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=50, activation='relu'))

#Seventh hidden layer (example of adding another hidden layer)
nn.add(Dense(units=25, activation='relu'))

#Eighth hidden layer (example of adding another hidden layer)
nn.add(Dense(units=15, activation='relu'))

# Output layer
nn.add(Dense(units=10, activation='softmax'))  # 3 units for 3 classes, softmax activation

# Check the structure of the model
nn.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_67 (Dense)            (None, 332)               727412    
                                                                 
 dense_68 (Dense)            (None, 250)               83250     
                                                                 
 dense_69 (Dense)            (None, 200)               50200     
                                                                 
 dense_70 (Dense)            (None, 100)               20100     
                                                                 
 dense_71 (Dense)            (None, 50)                5050      
                                                                 
 dense_72 (Dense)            (None, 25)                1275      
                                                                 
 dense_73 (Dense)            (None, 15)               

In [216]:
#compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [217]:
 # Fit the model to the training data
fit_model = nn.fit(X_train_scaled, y_train_encoded, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [218]:
# Evaluate the model using the test data
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test_encoded)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.3547576367855072
Test Accuracy: 0.9540500640869141


In [219]:
import numpy as np
# Get predicted probabilities for each class
y_pred_probabilities = nn.predict(X_test_scaled)
# Convert probabilities to predicted classes by selecting the class with the highest probability
y_pred = np.argmax(y_pred_probabilities, axis=1)
# Convert one-hot encoded y_test_encoded back to categorical labels
y_test_categorical = np.argmax(y_test_encoded, axis=1)
# Create a DataFrame to compare predicted and actual classes
result_df = pd.DataFrame({'Actual': y_test_categorical, 'Predicted': y_pred})
# Add a column indicating whether the prediction was correct
result_df['Correct'] = result_df['Actual'] == result_df['Predicted']
# Display the DataFrame
print(result_df)
# Calculate accuracy manually
accuracy = result_df['Correct'].mean()
print("Manual Test Accuracy:", accuracy)

      Actual  Predicted  Correct
0          1          1     True
1          3          3     True
2          4          4     True
3          4          1    False
4          4          4     True
...      ...        ...      ...
3390       1          1     True
3391       4          4     True
3392       7          7     True
3393       0          0     True
3394       3          3     True

[3395 rows x 3 columns]
Manual Test Accuracy: 0.9540500736377026


In [222]:

from google.colab import files
# Export result_df to a CSV file
result_df.to_csv('DrugName_prediction_results.csv', index=False)

In [224]:
# Download the CSV file
files.download('DrugName_prediction_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>