In [1]:
import pandas as pd
import numpy as np
import gensim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from gensim.models import Word2Vec
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

# Load Cleaned Dataset

In [2]:
# Read the cleaned drug-review table from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="clean_drug_data.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,day,month,year
0,Valsartan,Left Ventricular Dysfunction,"""""""It has no side effect, I take it in combina...",9.0,2012-05-20,27.0,20,5,2012
1,Guanfacine,ADHD,"""""""My son is halfway through his fourth week o...",8.0,2010-04-27,192.0,27,4,2010
2,Lybrel,Birth Control,"""""""I used to take another oral contraceptive, ...",5.0,2009-12-14,17.0,14,12,2009
3,Ortho Evra,Birth Control,"""""""This is my first time using any form of bir...",8.0,2015-11-03,10.0,3,11,2015
4,Buprenorphine / naloxone,Opiate Dependence,"""""""Suboxone has completely turned my life arou...",9.0,2016-11-27,37.0,27,11,2016


# Dataset Splitting

In [4]:
# Separate predictors from the target.
# "Unnamed: 0" is the name pandas gives a header-less first CSV column; in the
# preview above it simply repeats the row number, so it carries no signal and
# would otherwise be scaled and saved as a model feature.
# errors="ignore" keeps this cell working when the CSV has no such column.
X = df.drop(columns=["rating", "Unnamed: 0"], errors="ignore")
y = df["rating"]

In [5]:
# Hold out 20% of the rows for testing. Stratifying on the rating keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Text Processing

### Tokenization & Stemming

In [6]:
# Single Porter stemmer instance, reused by preprocess_text for every token.
stemmer = PorterStemmer()

In [7]:
def preprocess_text(text):
    """Tokenize a review and reduce every token to its stem.

    The text is split with ``gensim.utils.simple_preprocess`` (called with
    ``deacc=True``), and each resulting token is passed through the
    module-level ``stemmer``.

    Args:
        text: Raw review text.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    return [
        stemmer.stem(token)
        for token in gensim.utils.simple_preprocess(text, deacc=True)
    ]

In [8]:
# Add a column of stemmed token lists to each split, derived from its raw reviews.
for split_frame in (X_train, X_test):
    split_frame["tokenized_review"] = split_frame["review"].apply(preprocess_text)

### Word Embeddings (Word2Vec)

In [9]:
# Train the embedding model on the training split's tokens only, so the test
# reviews do not influence the learned vectors.
w2v_model = Word2Vec(
    sentences=X_train["tokenized_review"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

In [10]:
def get_avg_word2vec(tokens, model, vector_size=100):
    """Represent a token list as the mean of its known word vectors.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object exposing ``.wv``, which must support ``token in model.wv``
            and indexing with a list of tokens.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of the tokens found in
        ``model.wv``, or ``np.zeros(vector_size)`` if none are found.
    """
    known_tokens = []
    for token in tokens:
        if token in model.wv:
            known_tokens.append(token)

    # No in-vocabulary token: fall back to an all-zero vector.
    if not known_tokens:
        return np.zeros(vector_size)

    return np.mean(model.wv[known_tokens], axis=0)

In [11]:
# Turn each token list into one 100-dimensional averaged embedding, for both splits.
for split_frame in (X_train, X_test):
    split_frame["word2vec"] = split_frame["tokenized_review"].apply(
        lambda review_tokens: get_avg_word2vec(review_tokens, w2v_model, 100)
    )

In [13]:
# The raw and tokenized text are no longer needed once the embeddings exist.
X_train.drop(["review", "tokenized_review"], axis="columns", inplace=True)
X_test.drop(["review", "tokenized_review"], axis="columns", inplace=True)

# Feature Engineering (Encoding Categorical Variables)

In [14]:
# Registry of fitted encoders, keyed by column name, and the columns to encode.
label_encoders = dict()
categorical_cols = ["drugName", "condition"]

In [15]:
# Replace each categorical column with integer codes.
for col in categorical_cols:
    # Cast to str first so every value in the column has the same type.
    train_labels = X_train[col].astype(str)
    test_labels = X_test[col].astype(str)

    # The encoder is fitted on the labels of both splits together, so any
    # category that appears only in the test split is still known to it.
    le = LabelEncoder().fit(pd.concat([train_labels, test_labels], axis=0))

    # Each split is then transformed separately with that shared encoder.
    X_train[col] = le.transform(train_labels)
    X_test[col] = le.transform(test_labels)

    # Keep the fitted encoder so it can be reused later.
    label_encoders[col] = le



# Scaling & Oversampling

### Scaling

In [16]:
# NOTE: the standardization cell further down assigns a fresh StandardScaler to
# this same name before fitting, so this particular instance is never fitted.
scaler = StandardScaler()

In [None]:
# Remove columns that are still non-numeric before scaling.
# drugName and condition are label-encoded to integers by the encoding cell
# above; dropping them unconditionally would discard that work. Each candidate
# is therefore dropped only if its current dtype is not numeric. The raw `date`
# string always is, and its information is already present in day/month/year.
# The decision is made on the training split and applied to both splits so
# their columns stay aligned.
non_numeric_cols = ["drugName", "condition", "date"]
cols_to_drop = [
    col
    for col in non_numeric_cols
    if col in X_train.columns and not pd.api.types.is_numeric_dtype(X_train[col])
]
X_train.drop(columns=cols_to_drop, inplace=True)
X_test.drop(columns=[col for col in cols_to_drop if col in X_test.columns], inplace=True)

In [17]:
def flatten_embeddings(df, embedding_col="word2vec", vector_size=100):
    """Expand a column of embedding vectors into one numeric column per dimension.

    Args:
        df: DataFrame whose ``embedding_col`` holds one 1-D vector per row.
        embedding_col: Name of the column containing the vectors.
        vector_size: Number of dimensions in each vector; determines the
            generated column names ``w2v_0`` ... ``w2v_{vector_size - 1}``.

    Returns:
        pandas.DataFrame: A new frame with ``embedding_col`` removed and the
        ``w2v_*`` columns appended, sharing the index of ``df``.

    Raises:
        ValueError: If the stacked vectors do not have ``vector_size`` columns.
    """
    w2v_columns = [f"w2v_{i}" for i in range(vector_size)]

    if len(df) == 0:
        # np.vstack cannot stack an empty sequence; build a zero-row matrix so
        # an empty frame still comes back with the expected columns.
        word2vec_array = np.empty((0, vector_size))
    else:
        word2vec_array = np.vstack(df[embedding_col].values)  # Stack into matrix

    # Reuse df's index so the new columns line up row-for-row in the concat.
    word2vec_df = pd.DataFrame(word2vec_array, index=df.index, columns=w2v_columns)
    return pd.concat([df.drop(columns=[embedding_col]), word2vec_df], axis=1)

In [18]:
# Expand the embedding column of both splits into 100 separate w2v_* columns.
X_train, X_test = (
    flatten_embeddings(split_frame, embedding_col="word2vec", vector_size=100)
    for split_frame in (X_train, X_test)
)

In [19]:
# Force every remaining column to a numeric dtype; values that cannot be
# parsed as numbers become NaN (errors="coerce").
X_train, X_test = (
    split_frame.apply(pd.to_numeric, errors="coerce")
    for split_frame in (X_train, X_test)
)

In [20]:
# Replace the NaNs left behind by the numeric coercion with 0, in place.
X_train.fillna(value=0, inplace=True)
X_test.fillna(value=0, inplace=True)

In [21]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to both splits so no test-set statistics leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Apply Oversampling to Balance Classes (Training Data Only)

In [22]:
# Balance the rating classes by random oversampling. Only the training data is
# resampled; the test split is left untouched. The seed keeps the draw repeatable.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X=X_train_scaled, y=y_train)

In [23]:
# Wrap the scaled/resampled feature matrices in DataFrames again, restoring the
# feature names taken from the pre-scaling frames.
X_train_resampled = pd.DataFrame(data=X_train_resampled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)

# Save Processed Data for Modeling

In [24]:
# Write features and targets to CSV without the index column. Features and
# targets of each split are saved in the same row order.
X_train_resampled.to_csv(path_or_buf="X_train_processed.csv", index=False)
X_test_scaled.to_csv(path_or_buf="X_test_processed.csv", index=False)
pd.DataFrame(data=y_train_resampled, columns=["rating"]).to_csv(
    path_or_buf="y_train.csv", index=False
)
pd.DataFrame(data=y_test, columns=["rating"]).to_csv(
    path_or_buf="y_test.csv", index=False
)