# In [None]:
import os
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

# Mount Google Drive at /content/drive so files stored under MyDrive can be
# opened by path (this call comes from google.colab, so it only works in Colab).
drive.mount('/content/drive')

# Path of the Excel workbook that train_model() and predict_generic() read.
# The code in this file accesses its NAME, CONTENT, TYPE, DOSAGE and PRICE columns.
FILE_PATH = '/content/drive/MyDrive/Ml Project/new22.xlsx'

# Function to preprocess the DOSAGE and PRICE columns
def preprocess_columns(df):
    """Convert the DOSAGE and PRICE columns of *df* to floats, in place.

    DOSAGE cells are stringified and the first number found in the text
    becomes the value (``'500mg'`` -> ``500.0``, ``'.5 mg'`` -> ``0.5``);
    cells without any digits become NaN.  PRICE cells are converted with
    ``pd.to_numeric``; values that cannot be parsed become NaN.

    Args:
        df: DataFrame containing ``DOSAGE`` and ``PRICE`` columns.  The two
            columns are overwritten on this object.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If either column is missing or cannot be converted
            (the original exception is chained as the cause).
    """
    try:
        dosage_text = df['DOSAGE'].astype(str)
        # First alternative: ordinary numbers ('500', '2.5').
        # Second alternative: leading-dot decimals ('.5'), which would otherwise
        # be read as '5' (a 10x dosage error).  The look-behind keeps a dot that
        # ends an abbreviation ('No.5') from being treated as a decimal point.
        # expand=False makes extract() return a Series rather than a
        # one-column DataFrame.
        df['DOSAGE'] = dosage_text.str.extract(
            r'(\d+(?:\.\d+)?|(?<![\w.])\.\d+)', expand=False
        ).astype(float)

        # Invalid prices are turned into NaN instead of raising.
        df['PRICE'] = pd.to_numeric(df['PRICE'], errors='coerce')
        return df
    except Exception as e:
        raise ValueError(f"Error in preprocessing columns: {e}") from e

# Function to encode features
def encode_features(df, vectorizer=None):
    """Build the feature matrix: TF-IDF of CONTENT followed by DOSAGE and PRICE.

    Args:
        df: DataFrame with ``CONTENT``, ``DOSAGE`` and ``PRICE`` columns.  It is
            passed through ``preprocess_columns``, which overwrites DOSAGE and
            PRICE on this object.
        vectorizer: An already fitted ``TfidfVectorizer`` to reuse.  When
            ``None`` a new vectorizer is created and fitted on the CONTENT
            column.

    Returns:
        A ``(features, vectorizer)`` tuple: the sparse matrix produced by
        stacking the TF-IDF columns with the two numeric columns, and the
        vectorizer that produced the TF-IDF part.

    Raises:
        KeyError: If a column lookup in this function fails (e.g. CONTENT is
            missing).
        ValueError: For any other failure, including errors raised by
            ``preprocess_columns``.
    """
    try:
        df = preprocess_columns(df)

        # TfidfVectorizer rejects NaN and non-string documents, so missing
        # cells become empty documents and everything else is stringified.
        documents = ['' if pd.isna(text) else str(text) for text in df['CONTENT']]

        if vectorizer is None:
            vectorizer = TfidfVectorizer()
            content_vectors = vectorizer.fit_transform(documents)
        else:
            content_vectors = vectorizer.transform(documents)

        # Missing DOSAGE/PRICE values are encoded as 0 before being appended
        # as two extra columns to the sparse TF-IDF matrix.
        numeric_values = np.nan_to_num(df[['DOSAGE', 'PRICE']].values)
        features = hstack([content_vectors, numeric_values])

        return features, vectorizer
    except KeyError as e:
        raise KeyError(f"Missing required column: {e}") from e
    except Exception as e:
        raise ValueError(f"Error in encoding features: {e}") from e

# Function to train the ML model
def train_model():
    """Train the GENERIC-vs-other classifier and save it with its preprocessors.

    Workflow:
        1. Read ``FILE_PATH`` and drop rows lacking CONTENT or TYPE.
        2. Encode features with ``encode_features``; label a row 1 when its
           TYPE equals ``'GENERIC'`` and 0 otherwise.
        3. Hold out a stratified 20% test split.
        4. Fit the imputer and SMOTE oversampling on the training split only,
           so the held-out rows never influence training.
        5. Fit a random forest and print its metrics on the untouched test
           split.
        6. Dump model, vectorizer and imputer under ``/content/models``.

    Raises:
        FileNotFoundError: If the workbook at ``FILE_PATH`` does not exist.
        ValueError: For any other failure during training.
    """
    try:
        df = pd.read_excel(FILE_PATH).dropna(subset=['CONTENT', 'TYPE'])  # Ensure essential columns are not NaN
        print("Class Distribution Before Training:")
        print(df['TYPE'].value_counts())

        # The vectorizer is fitted on the full catalogue because the saved
        # vectorizer is also used by predict_generic for similarity search
        # over every row of the workbook.
        features, vectorizer = encode_features(df)
        target = (df['TYPE'] == 'GENERIC').astype(int)

        # Split BEFORE resampling: oversampling first would place synthetic
        # points interpolated from test rows into the training set and make
        # the reported metrics optimistic.
        X_train, X_test, y_train, y_test = train_test_split(
            features.toarray(),
            target,
            test_size=0.2,
            random_state=42,
            stratify=target,
        )

        # Imputer statistics come from the training split only.
        imputer = SimpleImputer(strategy='mean')  # Replace NaN with column mean
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        # SMOTE needs k_neighbors + 1 samples in the smallest class; shrink
        # k_neighbors for small datasets instead of failing.
        smallest_class = int(y_train.value_counts().min())
        if smallest_class < 2:
            raise ValueError(
                "Each class needs at least two training rows for SMOTE oversampling."
            )
        smote = SMOTE(random_state=42, k_neighbors=min(5, smallest_class - 1))
        X_train, y_train = smote.fit_resample(X_train, y_train)

        model = RandomForestClassifier(random_state=42, class_weight='balanced')
        model.fit(X_train, y_train)

        # Evaluate on real, never-resampled rows.
        predictions = model.predict(X_test)
        print("Model Performance:")
        print(classification_report(y_test, predictions))
        print("Accuracy:", accuracy_score(y_test, predictions))

        os.makedirs('/content/models', exist_ok=True)
        joblib.dump(model, '/content/models/medicine_model.pkl')
        joblib.dump(vectorizer, '/content/models/content_vectorizer.pkl')
        joblib.dump(imputer, '/content/models/imputer.pkl')  # Save the imputer for later use
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"The file at {FILE_PATH} was not found. Please verify the file path."
        ) from err
    except Exception as e:
        raise ValueError(f"Error during training: {e}") from e

# Function to predict similar generic medicines
def predict_generic(input_name: str, similarity_threshold=0.8):
    """Find generic medicines whose CONTENT resembles that of *input_name*.

    The workbook at ``FILE_PATH`` is read, the row named *input_name* is
    located, and every row whose TYPE is ``'GENERIC'`` is compared with it by
    cosine similarity of TF-IDF vectors.  Generics at or above the threshold
    are scored with the saved classifier and the best five are returned.

    Args:
        input_name: Value to look up in the NAME column.  An exact match is
            tried first; if none exists, a match ignoring case and surrounding
            whitespace is tried.
        similarity_threshold: Minimum cosine similarity a generic must reach
            to be kept.

    Returns:
        A DataFrame of at most five rows with the columns NAME, DOSAGE, PRICE,
        Content_Similarity and Prediction_Prob, sorted by probability and then
        similarity (descending); or a ``str`` message when the medicine is not
        found or no generic passes the threshold.

    Raises:
        FileNotFoundError: If the workbook or a saved model file is missing.
        ValueError: For any other failure during prediction.
    """
    try:
        df = pd.read_excel(FILE_PATH)
        df = preprocess_columns(df)

        # Load model, vectorizer, and imputer
        model = joblib.load('/content/models/medicine_model.pkl')
        vectorizer = joblib.load('/content/models/content_vectorizer.pkl')
        imputer = joblib.load('/content/models/imputer.pkl')

        # Find the standard medicine: exact match first, then a tolerant match
        # so stray spaces or different casing in typed input still resolve.
        standard_med = df[df['NAME'] == input_name]
        if standard_med.empty:
            wanted = str(input_name).strip().casefold()
            normalized_names = df['NAME'].astype(str).str.strip().str.casefold()
            standard_med = df[df['NAME'].notna() & (normalized_names == wanted)]
        if standard_med.empty:
            return f"Error: Standard medicine '{input_name}' not found."

        standard_content = standard_med['CONTENT'].iloc[0]
        if pd.isna(standard_content):
            raise ValueError(
                f"Standard medicine '{input_name}' has no CONTENT to compare."
            )

        no_match_message = (
            f"No generic medicines found with content similar to '{input_name}'."
        )

        # Rows without CONTENT cannot be vectorized (training drops them too).
        generic_df = df[df['TYPE'] == 'GENERIC'].dropna(subset=['CONTENT']).copy()
        if generic_df.empty:
            return no_match_message

        generic_vectors = vectorizer.transform(generic_df['CONTENT'].astype(str))
        standard_vector = vectorizer.transform([str(standard_content)])

        # Cosine similarity between the standard medicine and every generic
        similarity_scores = cosine_similarity(standard_vector, generic_vectors).flatten()
        generic_df['Content_Similarity'] = similarity_scores

        # Filter generics based on similarity threshold
        keep = similarity_scores >= similarity_threshold
        if not keep.any():
            return no_match_message
        # Copy so adding a column below does not write into a slice of generic_df.
        filtered_generics = generic_df[keep].copy()

        # Build features the same way encode_features does for training:
        # TF-IDF columns (reused from above) plus DOSAGE/PRICE with NaN -> 0.
        numeric_values = np.nan_to_num(filtered_generics[['DOSAGE', 'PRICE']].values)
        filtered_generic_features = hstack([generic_vectors[keep], numeric_values])
        filtered_generic_features_imputed = imputer.transform(
            filtered_generic_features.toarray()
        )

        # Probability of the positive label (1 == GENERIC); look the column up
        # by label rather than assuming it sits at index 1.
        probabilities = model.predict_proba(filtered_generic_features_imputed)
        class_labels = list(model.classes_)
        if 1 in class_labels:
            filtered_generics['Prediction_Prob'] = probabilities[:, class_labels.index(1)]
        else:
            filtered_generics['Prediction_Prob'] = 0.0

        # Sort by probability and return top results with PRICE
        top_generics = filtered_generics.sort_values(
            by=['Prediction_Prob', 'Content_Similarity'], ascending=False
        ).head(5)

        return top_generics[['NAME', 'DOSAGE', 'PRICE', 'Content_Similarity', 'Prediction_Prob']]
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Model or required files not found. Ensure the model is trained and saved correctly."
        ) from err
    except Exception as e:
        raise ValueError(f"Error during prediction: {e}") from e

# Main function
def main():
    """Train the model, ask for a medicine name, and print its generic matches.

    Training errors propagate to the caller; errors raised while predicting or
    printing the result are reported on stdout instead.
    """
    print("Training the ML model...")
    train_model()

    requested_medicine = input("Enter the name of the standard medicine: ")

    try:
        matches = predict_generic(requested_medicine)
        print("\nTop matching generic medicines with PRICE:")
        print(matches)
    except Exception as failure:
        # Lookup problems are shown to the user rather than ending in a traceback.
        print(f"Error: {failure}")

# Run the train-then-query workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
