In [1]:
# Import required libraries

import pandas as pd
import numpy as np
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from flask import Flask, request, jsonify
# Download required NLTK data

import nltk
nltk.download('stopwords', quiet=True)

True

In [2]:
def load_data(file_path):
    """
    Load dataset from a CSV file
    Args:
        file_path (str): Path to the CSV file
    Returns:
        pd.DataFrame: Loaded dataset
    """
    return pd.read_csv("/home/masubhaat/Product-Recommender/data/consumerdata.csv")

def clean_data(df):
    """
    Clean dataset by handling missing values, duplicates, and data types
    Args:
        df (pd.DataFrame): Input dataframe
    Returns:
        pd.DataFrame: Cleaned dataframe
    """
    # Create a copy to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Handle missing values
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:
            # Fill numeric columns with mean
            df[col] = df[col].fillna(df[col].mean())
        else:
            # Fill categorical columns with 'Unknown'
            df[col] = df[col].fillna('Unknown')

    # Drop duplicates
    df.drop_duplicates(inplace=True)

    # Convert date columns (case-insensitive)
    date_columns = [col for col in df.columns if re.search(r'date', col, re.IGNORECASE)]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        
    return df

In [3]:
def preprocess_text(text):
    """
    Preprocess text data by removing special characters, stopwords, and applying stemming
    Args:
        text (str): Input text
    Returns:
        str: Preprocessed text
    """
    if pd.isna(text):
        return ''
    
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower().strip())
    
    # Remove stopwords and apply stemming
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    words = [ps.stem(word) for word in text.split() if word not in stop_words]
    
    return ' '.join(words)

def prepare_data(df):
    """
    Prepare data for modeling by separating features and target
    Args:
        df (pd.DataFrame): Input dataframe
    Returns:
        tuple: (X, y) features and target variables
    """
    # Create copy to avoid modifications to original dataframe
    df = df.copy()
    
    # Separate features and target
    X = df.drop('PurchaseIntent', axis=1)
    y = df['PurchaseIntent']
    
    # Convert categorical variables to dummy variables
    X = pd.get_dummies(X, drop_first=True)
    
    return X, y

In [4]:
from sklearn.model_selection import GridSearchCV


def build_and_train_model(X_train, y_train):
    """
    Build and train a Random Forest model with hyperparameter tuning
    Args:
        X_train (pd.DataFrame): Training features
        y_train (pd.Series): Training target
    Returns:
        RandomForestClassifier: Trained model
    """
    # Define parameter grid for tuning
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    
    # Initialize base model
    base_model = RandomForestClassifier(random_state=42)
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    return grid_search.best_estimator_

In [5]:
def recommend_products(X, product_id, n_neighbors=5):
    """
    Recommend similar products using k-nearest neighbors
    Args:
        X (pd.DataFrame): Product features
        product_id (int): Reference product ID
        n_neighbors (int): Number of recommendations to return
    Returns:
        np.array: Indices of recommended products
    """
    # Initialize and fit the nearest neighbors model
    model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    model.fit(X)
    
    # Get recommendations
    distances, indices = model.kneighbors(
        X.iloc[product_id].values.reshape(1, -1)
    )
    
    return indices.flatten()

In [6]:
# Load and prepare the data
print("Loading data...")
df = load_data('../data/consumerdata.csv')

print("Cleaning data...")
df = clean_data(df)

print("Preparing data...")
X, y = prepare_data(df)

# Split the data
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
print("Training model...")
model = build_and_train_model(X_train, y_train)

# Evaluate the model
print("Evaluating model...")
y_pred = model.predict(X_test)
print('Model Accuracy:', accuracy_score(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Save the model
print("Saving model...")
joblib.dump(model, '../models/model.pkl')
print('Model saved successfully to ../models/model.pkl')

Loading data...


FileNotFoundError: [Errno 2] No such file or directory: '/home/masubhaat/Product-Recommender/data/consumerdata.csv'

In [7]:
# Initialize Flask application
app = Flask(__name__)

# Load the trained model
model = joblib.load('../models/model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    """
    API endpoint for making predictions
    Returns:
        json: Prediction result
    """
    try:
        # Get input data
        data = request.get_json(force=True)
        input_data = pd.DataFrame(data, index=[0])
        
        # Preprocess input data
        input_data = pd.get_dummies(input_data, drop_first=True)
        
        # Ensure input data has same columns as training data
        input_data = input_data.reindex(columns=model.feature_names_, fill_value=0)
        
        # Make prediction
        prediction = model.predict(input_data)
        
        return jsonify({'prediction': int(prediction[0])})
    
    except Exception as e:
        return jsonify({'error': str(e)}), 400

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (inotify)
Traceback (most recent call last):
  File "/home/masubhaat/anaconda3/lib/python3.12/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/masubhaat/anaconda3/lib/python3.12/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/home/masubhaat/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/masubhaat/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/home/masubhaat/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/masubhaat/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py",

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
