In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 3.1 MB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
from difflib import get_close_matches
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')

class DrugRecommendationSystem:
    """Recommend drugs for a medical condition from patient review data.

    On construction the system loads a training and a test CSV of drug
    reviews, cleans them, and aggregates per (condition, drug) statistics
    from the training set: rating mean/count, total useful votes, and VADER
    sentiment scores (overall, effectiveness-related, side-effect-related).
    ``recommend_drugs`` then ranks the drugs of one condition by a weighted
    combination of those statistics.

    Attributes set by ``__init__``:
        sentiment_analyzer: the VADER ``SentimentIntensityAnalyzer``.
        train_df, test_df: cleaned review DataFrames.
        drug_stats: one row per (condition, drugName) with the aggregates.
    """

    # Columns read from both CSV files.
    _REVIEW_COLUMNS = ['condition', 'drugName', 'rating', 'review', 'usefulCount']

    # Substrings that mark a sentence as talking about a given aspect.
    _EFFECTIVENESS_KEYWORDS = ('effective', 'worked', 'helps', 'improved', 'relief')
    _SIDE_EFFECTS_KEYWORDS = ('side effect', 'reaction', 'nausea', 'headache', 'pain')

    # Fixed schema of ``drug_stats`` so the columns exist even with no data.
    _STATS_COLUMNS = [
        'condition', 'drugName', 'rating_count', 'rating_mean', 'useful_count',
        'sentiment_mean', 'effectiveness_score', 'side_effects_score',
        'sample_reviews',
    ]

    def __init__(self, train_path, test_path):
        """Load both datasets and pre-compute the per-drug statistics.

        Args:
            train_path: path of the training CSV (source of all statistics).
            test_path: path of the test CSV (loaded and cleaned only).

        Raises:
            Exception: wrapping any failure during set-up; the original
                error is chained as ``__cause__``.
        """
        try:
            # Initialize sentiment analyzer
            print("Initializing sentiment analyzer...")
            self.sentiment_analyzer = SentimentIntensityAnalyzer()

            # Load datasets
            print("Loading datasets...")
            self.train_df = self._load_reviews(train_path)
            self.test_df = self._load_reviews(test_path)

            # Clean data
            print("Cleaning data...")
            self.preprocess_data()

            # Calculate drug statistics with sentiment
            print("Calculating drug statistics and analyzing sentiments...")
            self.calculate_drug_stats()

            print("System initialized successfully!")

        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error initializing system: {str(e)}") from e

    @classmethod
    def _load_reviews(cls, path):
        """Read one review CSV, keeping only the columns the system uses."""
        return pd.read_csv(
            path,
            encoding='utf-8',
            on_bad_lines='skip',
            usecols=cls._REVIEW_COLUMNS,
        )

    def analyze_sentiment(self, review):
        """Return the VADER compound score of ``review`` (in [-1, 1]).

        Missing or empty reviews, and reviews the analyzer fails on,
        score 0 (neutral).
        """
        try:
            if pd.isna(review) or review == '':
                return 0

            return self.sentiment_analyzer.polarity_scores(review)['compound']

        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return 0

    def analyze_review_aspects(self, review):
        """Score a review overall and per aspect.

        The review is split on '.'; each fragment mentioning an
        effectiveness or side-effect keyword contributes its compound score
        to that aspect's average.

        Returns:
            dict with keys ``'effectiveness'``, ``'side_effects'`` and
            ``'overall'``. An aspect that is never mentioned scores 0;
            ``'overall'`` is the compound score of the whole review.
            All three are 0 for missing/empty reviews or on error.
        """
        try:
            if pd.isna(review) or review == '':
                return {'effectiveness': 0, 'side_effects': 0, 'overall': 0}

            effectiveness_scores = []
            side_effects_scores = []

            for sentence in review.split('.'):
                # Keywords are matched case-insensitively ...
                lowered = sentence.lower()
                about_effectiveness = any(
                    keyword in lowered for keyword in self._EFFECTIVENESS_KEYWORDS
                )
                about_side_effects = any(
                    keyword in lowered for keyword in self._SIDE_EFFECTS_KEYWORDS
                )
                if not (about_effectiveness or about_side_effects):
                    continue  # irrelevant fragment: no need to score it

                # ... but the original casing is scored, because VADER uses
                # capitalization as an intensity cue.
                sentiment = self.sentiment_analyzer.polarity_scores(sentence)['compound']
                if about_effectiveness:
                    effectiveness_scores.append(sentiment)
                if about_side_effects:
                    side_effects_scores.append(sentiment)

            return {
                'effectiveness': np.mean(effectiveness_scores) if effectiveness_scores else 0,
                'side_effects': np.mean(side_effects_scores) if side_effects_scores else 0,
                # Score the whole review, not whichever fragment came last
                # (usually the empty string after the final period).
                'overall': self.analyze_sentiment(review),
            }

        except Exception as e:
            print(f"Error in aspect analysis: {str(e)}")
            return {'effectiveness': 0, 'side_effects': 0, 'overall': 0}

    @staticmethod
    def _clean_frame(df):
        """Return a cleaned copy of one review DataFrame.

        Ratings and useful counts become numeric (missing useful counts
        become 0), rows without a numeric rating are dropped, condition
        names are lower-cased and stripped, drug names stripped, and missing
        reviews replaced by the empty string.
        """
        cleaned = df.copy()
        cleaned['rating'] = pd.to_numeric(cleaned['rating'], errors='coerce')
        cleaned['usefulCount'] = pd.to_numeric(
            cleaned['usefulCount'], errors='coerce'
        ).fillna(0)

        # Copy after filtering so the column assignments below write to an
        # independent frame rather than a slice.
        cleaned = cleaned.dropna(subset=['rating']).copy()

        cleaned['condition'] = cleaned['condition'].str.lower().str.strip()
        cleaned['drugName'] = cleaned['drugName'].str.strip()
        cleaned['review'] = cleaned['review'].fillna('').astype(str)
        return cleaned

    def preprocess_data(self):
        """Clean ``train_df`` and ``test_df`` with the same rules."""
        self.train_df = self._clean_frame(self.train_df)
        self.test_df = self._clean_frame(self.test_df)

    def calculate_drug_stats(self):
        """Aggregate ``train_df`` into ``drug_stats``.

        Produces one row per (condition, drugName) group holding the review
        count, mean rating, summed useful votes, the mean of each sentiment
        aspect over the group's reviews, and the first three reviews as
        samples.
        """
        stats_list = []

        for (condition, drug), group in self.train_df.groupby(['condition', 'drugName']):
            aspects = [self.analyze_review_aspects(review) for review in group['review']]

            stats_list.append({
                'condition': condition,
                'drugName': drug,
                'rating_count': len(group),
                'rating_mean': group['rating'].mean(),
                'useful_count': group['usefulCount'].sum(),
                'sentiment_mean': np.mean([a['overall'] for a in aspects]),
                'effectiveness_score': np.mean([a['effectiveness'] for a in aspects]),
                'side_effects_score': np.mean([a['side_effects'] for a in aspects]),
                'sample_reviews': group['review'].head(3).tolist(),
            })

        # Explicit columns keep the schema intact when there are no groups,
        # so later lookups of drug_stats['condition'] do not raise KeyError.
        self.drug_stats = pd.DataFrame(stats_list, columns=self._STATS_COLUMNS)

    @staticmethod
    def _prompt_for_condition(matching_conditions):
        """Ask the user to pick one of ``matching_conditions`` on stdin.

        Returns the chosen condition, or ``None`` if the user enters 0.
        Re-prompts until a valid number is given.
        """
        print("\nDid you mean one of these conditions?")
        for i, cond in enumerate(matching_conditions, 1):
            print(f"{i}. {cond}")

        while True:
            try:
                choice = int(input("\nEnter the number of your condition (0 to cancel): "))
            except ValueError:
                print("Please enter a valid number")
                continue
            if choice == 0:
                return None
            if 1 <= choice <= len(matching_conditions):
                return matching_conditions[choice - 1]
            print(f"Please enter a number between 0 and {len(matching_conditions)}")

    def recommend_drugs(self, condition, top_n=5):
        """Return the ``top_n`` highest-scoring drugs for ``condition``.

        If ``condition`` (lower-cased, stripped) is not itself among the
        close matches, the user is asked on stdin to choose one of them.

        Returns:
            dict with ``'condition'`` (the condition actually used) and
            ``'recommendations'`` (list of per-drug dicts, best first), or a
            plain message string when nothing matches, the user cancels, or
            an error occurs.
        """
        try:
            matching_conditions = self.find_matching_conditions(condition)

            if not matching_conditions:
                return f"No matching conditions found for: {condition}"

            normalized = condition.lower().strip()
            if normalized in matching_conditions:
                selected_condition = normalized
            else:
                selected_condition = self._prompt_for_condition(matching_conditions)
                if selected_condition is None:
                    return "Operation cancelled by user."

            condition_drugs = self.drug_stats[
                self.drug_stats['condition'] == selected_condition
            ].copy()

            if condition_drugs.empty:
                return f"No drugs found for condition: {selected_condition}"

            max_count = condition_drugs['rating_count'].max()
            # Avoid dividing by zero when no review has any useful votes.
            max_useful = condition_drugs['useful_count'].max()
            if not max_useful > 0:
                max_useful = 1

            # Only negative sentiment around side effects counts against a
            # drug; positive mentions (e.g. "no side effects") are not
            # penalized.
            side_effect_penalty = condition_drugs['side_effects_score'].clip(upper=0).abs()

            condition_drugs['weighted_score'] = (
                (condition_drugs['rating_mean'] / 10 * 0.3) +          # 30% rating
                (condition_drugs['sentiment_mean'] * 0.2) +            # 20% overall sentiment
                (condition_drugs['effectiveness_score'] * 0.2) +       # 20% effectiveness
                (side_effect_penalty * -0.1) +                         # 10% side-effect penalty
                (condition_drugs['rating_count'] / max_count * 0.1) +  # 10% number of ratings
                (condition_drugs['useful_count'] / max_useful * 0.1)   # 10% usefulness
            )

            top_drugs = condition_drugs.nlargest(top_n, 'weighted_score')

            return {
                'condition': selected_condition,
                'recommendations': [{
                    'drug': row['drugName'],
                    'average_rating': round(row['rating_mean'], 2),
                    'sentiment_score': round(row['sentiment_mean'], 3),
                    'effectiveness_score': round(row['effectiveness_score'], 3),
                    'side_effects_score': round(row['side_effects_score'], 3),
                    'number_of_ratings': int(row['rating_count']),
                    'sample_reviews': row['sample_reviews']
                } for _, row in top_drugs.iterrows()]
            }

        except Exception as e:
            return f"Error generating recommendations: {str(e)}"

    def find_matching_conditions(self, user_input):
        """Return up to five known conditions that closely match ``user_input``.

        Matching is done with ``difflib.get_close_matches`` (cutoff 0.6) on
        the lower-cased, stripped input; best matches come first.
        """
        available_conditions = sorted(self.drug_stats['condition'].unique())
        user_input = user_input.lower().strip()
        return get_close_matches(user_input, available_conditions, n=5, cutoff=0.6)

# Interactive command-line entry point for the recommendation system.

def _meter(value, filled, empty, width=10):
    """Return a ``width``-cell bar with ``int(value)`` filled cells, clamped to [0, width]."""
    filled_cells = max(0, min(width, int(value)))
    return filled * filled_cells + empty * (width - filled_cells)


def _print_recommendation(rank, rec):
    """Print one ranked recommendation dict as a formatted text card."""
    # Map compound scores from [-1, 1] onto 0-100% for readability.
    sentiment_percentage = ((rec['sentiment_score'] + 1) / 2) * 100
    effectiveness_percentage = ((rec['effectiveness_score'] + 1) / 2) * 100

    rating_bar = _meter(rec['average_rating'], "★", "☆")
    satisfaction_bar = _meter(sentiment_percentage / 10, '▓', '░')
    effectiveness_bar = _meter(effectiveness_percentage / 10, '▓', '░')

    print(f"\n{rank}. {rec['drug'].upper()}")
    print(f"   {'─' * 75}")
    print(f"   Overall Rating: {rating_bar} ({rec['average_rating']}/10)")
    print(f"   Patient Satisfaction: {satisfaction_bar} ({sentiment_percentage:.0f}%)")
    print(f"   Treatment Effectiveness: {effectiveness_bar} ({effectiveness_percentage:.0f}%)")

    # Convert side effects score to risk level: the more negative the
    # sentiment around side-effect mentions, the higher the reported risk.
    if rec['side_effects_score'] > -0.3:
        risk_level, risk_color = "Low", "🟢"
    elif rec['side_effects_score'] > -0.6:
        risk_level, risk_color = "Moderate", "🟡"
    else:
        risk_level, risk_color = "High", "🔴"

    print(f"   Side Effects Risk: {risk_color} {risk_level}")
    print(f"   Based on {rec['number_of_ratings']:,} patient reviews")

    print("\n   📝 Recent Patient Experiences:")
    for i, review in enumerate(rec['sample_reviews'], 1):
        if review.strip():  # Only show non-empty reviews
            formatted_review = review[:200] + "..." if len(review) > 200 else review
            print(f"      {i}. \"{formatted_review}\"")

    print(f"\n   {'─' * 75}")


def main(train_path='/content/drive/MyDrive/drugsComTrain_raw.csv',
         test_path='/content/drive/MyDrive/drugsComTest_raw.csv'):
    """Run the interactive recommendation session on stdin/stdout.

    Builds a ``DrugRecommendationSystem`` from the two CSV files, then
    repeatedly asks for a condition and prints its top recommendations until
    the user types 'exit' or declines to continue.

    Args:
        train_path: training CSV path; defaults to the original Drive path.
        test_path: test CSV path; defaults to the original Drive path.
    """
    print("\n💊 Advanced Drug Recommendation System 💊")
    print("Based on analysis of real patient experiences")

    # Start-up failures are reported separately from errors in the session
    # loop, so a runtime problem is not mislabeled as a data-file problem.
    try:
        print("\nInitializing system and loading datasets...")
        recommender = DrugRecommendationSystem(train_path, test_path)
    except Exception as e:
        print(f"\n❌ Error initializing the system: {str(e)}")
        print("Please make sure the data files exist and are in the correct format.")
        return

    try:
        while True:
            print("\n📋 Enter a medical condition (or 'exit' to quit)")
            condition = input("Condition: ").strip()

            if condition.lower() == 'exit':
                print("\nThank you for using the Drug Recommendation System. Goodbye!")
                break

            print("\n🔍 Analyzing patient experiences and generating recommendations...")
            results = recommender.recommend_drugs(condition)

            # A dict means success; anything else is a message to show as-is.
            if isinstance(results, dict):
                print(f"\n🏥 Top Treatments for {results['condition'].title()}")
                print("=" * 80)
                for rank, rec in enumerate(results['recommendations'], 1):
                    _print_recommendation(rank, rec)
            else:
                print(results)

            choice = input("\nWould you like to check another condition? (y/n): ").lower()
            if choice != 'y':
                print("\n💫 Thank you for using the Drug Recommendation System. Take care! 💫")
                break

    except Exception as e:
        print(f"\n❌ Unexpected error: {str(e)}")

# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


💊 Advanced Drug Recommendation System 💊
Based on analysis of real patient experiences

Initializing system and loading datasets...
Initializing sentiment analyzer...
Loading datasets...
Cleaning data...
Calculating drug statistics and analyzing sentiments...
System initialized successfully!

📋 Enter a medical condition (or 'exit' to quit)

🔍 Analyzing patient experiences and generating recommendations...

Did you mean one of these conditions?
1. diabetes, type 2
2. diabetes, type 1
3. diabetes insipidus

🏥 Top Treatments for Diabetes, Type 1

1. AFREZZA
   ───────────────────────────────────────────────────────────────────────────
   Overall Rating: ★★★★★★★☆☆☆ (7.67/10)
   Patient Satisfaction: ▓▓▓▓▓░░░░░ (50%)
   Treatment Effectiveness: ▓▓▓▓▓░░░░░ (51%)
   Side Effects Risk: 🟢 Low
   Based on 27 patient reviews

   📝 Recent Patient Experiences:
      1. ""So far my experience is everything that I knew it could do...FASTER INSULIN is what we need and afrezza is here!  Afrezza works i