In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch every NLTK resource the notebook relies on. Besides the stopword list
# and WordNet, nltk.word_tokenize (called in the text preprocessing step) needs
# the Punkt sentence tokenizer data; recent NLTK releases look it up under the
# name 'punkt_tab', older ones under 'punkt', so both are requested.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# !pip install transformers

In [14]:
# Read the raw review tables (test split first, then train) from the working directory
test_df = pd.read_csv('drugsComTest_raw.csv')
df = pd.read_csv('drugsComTrain_raw.csv')

# Discard, in place, every training row that has at least one missing field
df.dropna(inplace=True)

# Parse the day-abbreviated month-two digit year strings into datetime values
parsed_dates = pd.to_datetime(df['date'], format='%d-%b-%y')
df['date'] = parsed_dates

In [15]:
# Text Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case, tokenize, filter and lemmatize a piece of text.

    Tokens that are not purely alphanumeric, or that appear in the English
    stopword set, are discarded; the remaining tokens are lemmatized and
    joined back together with single spaces.

    Args:
        text: String to normalize.

    Returns:
        str: Space-separated lemmatized tokens (empty string if none survive).
    """
    kept = []
    for word in nltk.word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stopwords
        if not word.isalnum() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

In [16]:
# Normalize both free-text columns with the shared preprocessing routine
for source_col, target_col in (('review', 'processed_review'),
                               ('condition', 'processed_condition')):
    df[target_col] = df[source_col].apply(preprocess_text)

# Combine processed condition and review columns (condition first, one space between)
df['combined_text'] = df['processed_condition'].str.cat(df['processed_review'], sep=' ')

In [17]:
# Sentiment Analysis
# Tokenizer and classifier are loaded from the same pretrained checkpoint.
SENTIMENT_MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
def sentiment_score(review):
    """Return the predicted sentiment class of ``review`` as a plain int.

    The text is encoded with the notebook-level ``tokenizer`` (truncated to
    512 tokens), passed through the notebook-level ``model``, and the index of
    the largest logit is shifted by one so the lowest class is 1.

    Args:
        review: Text of a single review.

    Returns:
        int: 1-based index of the highest-scoring class.
    """
    # Tokenize and truncate the input text to a maximum of 512 tokens
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: disable autograd so no graph is recorded for each review
    with torch.no_grad():
        result = model(tokens)
    # .item() converts the 0-d tensor to a Python number, keeping the
    # resulting DataFrame column numeric instead of a column of tensors
    return int(torch.argmax(result.logits).item()) + 1

# Score every raw (unprocessed) review, one model call per row, and store the result
review_sentiments = df['review'].apply(sentiment_score)
df['sentiment'] = review_sentiments


In [None]:
# TF-IDF Vectorization of combined text (vocabulary capped at 500 terms, dense output)
vectorizer = TfidfVectorizer(max_features=500)
X_combined_text = vectorizer.fit_transform(df['combined_text']).toarray()

# Append the per-review numeric columns (including the sentiment score) to the
# right of the TF-IDF columns
numeric_columns = ['rating', 'usefulCount', 'sentiment']
numerical_features = df[numeric_columns].to_numpy()
X_combined = np.concatenate((X_combined_text, numerical_features), axis=1)

In [None]:
# Standardize every column of the combined matrix (TF-IDF terms and numeric
# features alike) to zero mean and unit variance
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Reduce to 50 principal components. The seed is fixed, as it is for KMeans in
# this notebook, so that a randomized SVD solver yields the same components
# (and therefore the same clusters) on every run.
pca = PCA(n_components=50, random_state=42)
X_combined_pca = pca.fit_transform(X_combined_scaled)

In [None]:
# Optional: Visualize the cumulative variance explained by the PCA components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x-axis at 1 so each point lines up with the number of components
# kept (the default x-axis would start at 0 and be off by one)
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance Explained')
plt.show()

In [None]:
# Elbow method to find optimal number of clusters:
# fit one seeded KMeans per candidate k in 2..10 and record its inertia
wcss = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_combined_pca)
    wcss.append(kmeans.inertia_)

In [None]:
# Plot the elbow method graph: inertia against the candidate cluster counts 2..10
fig, ax = plt.subplots()
ax.plot(range(2, 11), wcss)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
ax.set_title('Elbow Method for Optimal Number of Clusters')
plt.show()

In [None]:
optimal_clusters = 4  # Choose based on the elbow plot
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42).fit(X_combined_pca)
# labels_ holds the cluster index assigned to each training row during fit
df['Cluster'] = kmeans.labels_

In [None]:
# Recommendation function
def recommend_drug(condition, top_n=3):
    """Recommend up to ``top_n`` distinct drugs for a free-text condition.

    The condition is preprocessed and vectorized with the fitted TF-IDF
    ``vectorizer``, pushed through the fitted ``scaler`` and ``pca``, and
    assigned to a cluster by ``kmeans``. Reviews in that cluster are ranked by
    sentiment, rating and usefulCount (all descending).

    Args:
        condition: Condition name as typed by the user.
        top_n: Maximum number of drug names to return.

    Returns:
        list: Up to ``top_n`` unique drug names, best-ranked first.
    """
    processed_condition = preprocess_text(condition)
    input_condition_vector = vectorizer.transform([processed_condition]).toarray()

    # The query has no rating / usefulCount / sentiment. Fill those trailing
    # columns with the training means so they standardize to 0 (neutral);
    # raw zeros would be scaled to an extreme, out-of-range point instead.
    n_numeric = numerical_features.shape[1]
    neutral_numeric = scaler.mean_[-n_numeric:].reshape(1, -1)
    input_combined = np.hstack((input_condition_vector, neutral_numeric))
    input_combined_scaled = scaler.transform(input_combined)
    input_combined_pca = pca.transform(input_combined_scaled)
    predicted_cluster = kmeans.predict(input_combined_pca)[0]

    candidates = df[df['Cluster'] == predicted_cluster]
    # Prefer reviews written for the requested condition; if the cluster holds
    # none, fall back to ranking the whole cluster.
    same_condition = candidates[candidates['processed_condition'] == processed_condition]
    if not same_condition.empty:
        candidates = same_condition

    ranked = candidates.sort_values(by=['sentiment', 'rating', 'usefulCount'], ascending=False)
    # There is one row per review, so keep only the best-ranked row of each drug
    return ranked['drugName'].drop_duplicates().head(top_n).tolist()

In [None]:
# Example usage: fetch the default number of recommendations for one condition
user_condition = "Weight Loss"
recommended_drugs = recommend_drug(condition=user_condition)
print(f"Recommended drugs for '{user_condition}': {recommended_drugs}")
