In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
import joblib
from flask import Flask, request, jsonify

# Load NLTK resources
import nltk
nltk.download('stopwords')
print('done')

done


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/masubhaat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_data(file_path):
    
    #Load the dataset from a CSV file
    df = pd.read_csv(file_path)
    return df

def clean_data(df):
    #Clean the dataset by handling missing values, duplicates, and data types.
    # Handle missing values
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:  # Numeric columns
            df[col].fillna(df[col].mean(), inplace=True)
        elif df[col].dtype == 'object':  # Object (string) columns
            df[col].fillna('Unknown', inplace=True)

    # Drop duplicates
    df.drop_duplicates(inplace=True)

    # Convert date columns (case-insensitive)
    date_columns = [col for col in df.columns if re.search(r'date', col, re.IGNORECASE)]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    

    return df


In [3]:
file_path = '/home/masubhaat/recommender_project/data/consumerdata.csv'  
# Update with your dataset path
data = load_data(file_path)
cleaned_data = clean_data(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


In [4]:
data.head(1000)

Unnamed: 0,ProductID,ProductCategory,ProductBrand,ProductPrice,CustomerAge,CustomerGender,PurchaseFrequency,CustomerSatisfaction,PurchaseIntent
0,5874,Smartphones,Other Brands,312.949668,18,0,2,1,0
1,5875,Smart Watches,Samsung,980.389404,35,1,7,2,1
2,5876,Tablets,Samsung,2606.718293,63,0,1,5,1
3,5877,Smartphones,Samsung,870.395450,63,1,10,3,1
4,5878,Tablets,Sony,1798.955875,57,0,17,3,0
...,...,...,...,...,...,...,...,...,...
995,6869,Smart Watches,HP,2906.298447,46,1,9,3,1
996,6870,Headphones,Other Brands,2405.368758,58,1,8,2,1
997,6871,Smartphones,Other Brands,2369.000167,27,0,19,2,0
998,6872,Laptops,HP,1931.805793,40,0,7,5,1


In [5]:
def preprocess_text(text):
    # Preprocess text data by removing special characters, stopwords, and stemming.
    
    if pd.isna(text):  # Handle NaN values
        return ''
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = text.strip()
    
    # Tokenization and stemming
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    words = text.split()
    words = [ps.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Apply text preprocessing to review and summary columns (case-insensitive)
text_columns = [col for col in cleaned_data.columns if re.search(r'name', col, re.IGNORECASE)]
for col in text_columns:
    cleaned_data[f'processed_{col}'] = cleaned_data[col].apply(preprocess_text)

    

In [6]:
def prepare_data(df):

    #Prepare data for modeling by separating features and target
    # Assuming 'target' is the column you want to predict
    
    X = df.drop('ProductID', axis=1)
    y = df['ProductID']
    
    # Convert categorical variables to dummy variables
    X = pd.get_dummies(X, drop_first=True)
    
    return X, y

# Split the data into training and testing sets

X, y = prepare_data(cleaned_data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

-X
# Example: assume X and y are your features and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def build_model(X_train, y_train):
    # Initialize the model with a random_state for reproducibility
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    return model

# Baseline model training
model = build_model(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Baseline Accuracy: {accuracy:.2f}')

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), 
    param_grid, 
    cv=3,  # Adjusted number of splits
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the tuned model
y_pred_best = best_model.predict(X_test)

print(f'Tuned Accuracy: {accuracy_score(y_test, y_pred_best):.2f}')

print(classification_report(y_test, y_pred_best))


Baseline Accuracy: 0.10




Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Tuned Accuracy: 0.10
              precision    recall  f1-score   support

  B002SZEOLG       0.00      0.00      0.00         0
  B005FYNT3G       0.00      0.00      0.00         0
  B005LJQMZC       0.00      0.00      0.00         0
  B006LW0WDQ       0.00      0.00      0.00         1
  B0073QGKAS       0.00      0.00      0.00         1
  B0085W2MUQ       0.00      0.00      0.00         0
  B0088TKTY2       0.00      0.00      0.00         1
  B008IFXQFU       0.00      0.00      0.00         0
  B00935MD1C       0.00      0.00      0.00         0
  B00935MGHS       0.00      0.00      0.00         1
  B009P2LIL4       0.00      0.00      0.00         0
  B009P2LITG       0.00      0.00      0.00         1
  B00A7PLVU6       0.00      0.00      0.00         1
  B00EYW1U68       0.00      0.00      0.00         0
  B00F159RIK       0.00      0.00      0.00         1
  B00GE55L22       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
def cluster_data(X):
    # """Cluster data using KMeans."""
    kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust number of clusters as needed
    kmeans.fit(X)
    return kmeans.labels_

# Apply clustering
labels = cluster_data(X)
print(f'Cluster labels: {labels}')

Cluster labels: [2 0 1 ... 1 3 4]


In [10]:
def recommend_products(X, product_id, n_neighbors=5):
    # """Recommend similar products using NearestNeighbors."""
    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(X)
    distances, indices = model.kneighbors(X.iloc[product_id].values.reshape(1, -1))
    return indices.flatten()

# Example of recommending products
product_id = 0  # Replace with the actual product index you want to recommend similar products for
recommended_indices = recommend_products(X, product_id)
print(f'Recommended product indices: {recommended_indices}')

Recommended product indices: [   0 5243 2365 1569 1376]




In [11]:
from flask import Flask, request, jsonify
import joblib

# Save the model
joblib.dump(model, 'model.pkl')

# Load the model
model = joblib.load('model.pkl')

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    input_data = pd.DataFrame(data, index=[0])
    
    # Preprocess input data
    input_data = pd.get_dummies(input_data, drop_first=True)
    
    # Ensure the input data has the same columns as the training data
    input_data = input_data.reindex(columns=X_train.columns, fill_value=0)
    
    prediction = model.predict(input_data)
    return jsonify({'prediction': prediction[0]})

if __name__ == '__main__':
    app.run(debug=True)

NameError: name 'model' is not defined

In [19]:
import requests

url = 'http://127.0.0.1:5000/predict'
data = {
 
    'asin': 'B0B5LVS732',
    'related_asin': 'B0BF57RN3K',
    'price': 19.99,            # Example numeric feature
    'rating': 4.5              # Example numeric feature
}

    # Add other features as needed


response = requests.post(url, json=data)
print(response.json())

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x715b61694e80>: Failed to establish a new connection: [Errno 111] Connection refused'))