In [1]:
import time
import warnings
warnings.filterwarnings('ignore')

# For loading the dataframe
import pickle

# For loading data and data manipulation
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# For cleaning the text
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# For converting them into vectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# To find the similarity between products
from sklearn.metrics.pairwise import cosine_similarity

# For model building libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Loading the datasets

In [2]:
# Load the pickled prediction dataframe. The context manager closes the file
# handle, which a bare open() passed straight to pickle.load would leak.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("eda_data/predict.pkl", "rb") as predict_file:
    df_predict = pickle.load(predict_file)
df_predict.head()

Unnamed: 0,Preferences,Category
0,I’m more of a tea person 🍵$$$I prefer herbal/d...,sweet
1,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",hard liquor
2,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",accessory
3,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",liquoraccessory
4,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",sweet


# Clean the preference

In [3]:
# Creating object of tokenizer and lemmatizer (built once, reused by clean_pref)
# - tokenizer keeps only runs of word characters (regex \w+), so punctuation is dropped
# - lemmatizer is NLTK's WordNet lemmatizer, applied to each remaining token
tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()

In [4]:
# English stop words, loaded once into a set. Calling stopwords.words('english')
# inside the comprehension rebuilt the list and scanned it linearly for every
# single token, which dominated the cleaning time.
_STOP_WORDS = frozenset(stopwords.words('english'))


# Function to clean the Preference column
def clean_pref(row):
    """Normalize one preference string before vectorization.

    Steps: lower-case the text, split it into word tokens with the shared
    ``tokenizer``, drop English stop words, lemmatize what is left with the
    shared ``lemmatizer`` and join the tokens back with single spaces.

    Args:
        row: The raw preference text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # convert into lower character and split string into words
    tokens = tokenizer.tokenize(row.lower())
    # remove stop words (checked on the raw token) and apply lemmatization
    kept = [lemmatizer.lemmatize(token.strip()) for token in tokens if token not in _STOP_WORDS]
    # convert words into a string again and return
    return ' '.join(kept)

In [None]:
%%time
# Applying clean preference function to transform the text
# (clean_pref runs once per row; the result is stored in a new Preference_Clean column)
df_predict['Preference_Clean'] = df_predict['Preferences'].apply(clean_pref)

In [None]:
# Preview the dataframe, now including the Preference_Clean column
df_predict.head()

# Split the data into train and test

In [None]:
# Feature: the cleaned preference text; target: the category label
X = df_predict['Preference_Clean']
y = df_predict['Category']

# Shapes of feature and target, shown side by side for comparison
X.shape, y.shape

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shapes of the four resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Converting feature into vectors

In [None]:
# TF-IDF vectorizer for the preference text: vocabulary capped at 3000 terms,
# with scikit-learn's built-in English stop-word list applied
tfidf_predict = TfidfVectorizer(max_features=3000, stop_words='english')

In [None]:
# Fit the vectorizer on the training text only, then reuse the fitted vectorizer
# for the test text; .toarray() turns each result into a dense array
X_train_vec = tfidf_predict.fit_transform(X_train).toarray()
X_test_vec = tfidf_predict.transform(X_test).toarray()

In [None]:
# Shape of the vectorized training set
X_train_vec.shape

# Creating a model

### Increase the model performance:
* Initially there were 39 categories, and the best accuracy was **18 percent**
* I then found that some categories were duplicates differing only in letter case; after converting all categories to lowercase, the best accuracy rose to around **20 percent**
* To improve further, I removed the categories that had fewer than 50 records, which raised the best accuracy to **23 percent**

In [None]:
# Candidate classifiers to compare on the TF-IDF features.
# Every estimator that accepts a seed uses the same random_state so that a
# Restart & Run All reproduces the same scores. SVC (whose probability=True
# estimates use randomness), DecisionTreeClassifier and liblinear
# LogisticRegression were previously unseeded.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True, random_state=2)
knc = KNeighborsClassifier(n_neighbors=97)
bnb = BernoulliNB()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)

In [None]:
%%time
model_perf = []
classifiers = [svc, knc, bnb, mnb, dtc, lrc, rfc, abc, bc, etc, gbdt]
for classifier in classifiers:
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    model_perf.append({'Model': classifier, 'Accuracy': accuracy})

### Model performance

In [None]:
# Tabulate each fitted model next to its test-set accuracy
pd.DataFrame(model_perf)

# Creating recommendation model

In [None]:
# Load the pickled recommendation dataframe. The context manager closes the file
# handle, which a bare open() passed straight to pickle.load would leak.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("eda_data/recommendation.pkl", "rb") as recommend_file:
    df_recommend = pickle.load(recommend_file)
df_recommend.head()

### Converting tags column into vectors

In [None]:
# Separate TF-IDF vectorizer for the product tags: vocabulary capped at 5000 terms,
# with scikit-learn's built-in English stop-word list applied
tfidf_recommend = TfidfVectorizer(max_features=5000, stop_words='english')

In [None]:
# Fit the vectorizer on the Tag column and convert the result to a dense array
# (one row per row of df_recommend)
recommend_vec = tfidf_recommend.fit_transform(df_recommend['Tag']).toarray()
recommend_vec

In [None]:
# Shape of the tag vectors
recommend_vec.shape

### Calculate the cosine similarity between the tag vectors; each entry of the resulting matrix is the similarity score between the tags of two products

In [None]:
# Pairwise cosine similarity between all rows of recommend_vec;
# row i holds the similarity of product i to every product
similarity = cosine_similarity(recommend_vec)
similarity

In [None]:
# Shape of the similarity matrix
similarity.shape

### Recommender function that returns the products most similar to the given one

In [None]:
def recommender(product, top_n=5):
    """Return the names of the products most similar to ``product``.

    Similarity is read from the precomputed ``similarity`` matrix, whose rows
    follow the row order of ``df_recommend``.

    Args:
        product: Exact ``ProductName`` to find neighbours for. If the name
            occurs more than once, the first occurrence is used.
        top_n: How many similar products to return (default 5).

    Returns:
        A list of up to ``top_n`` product names, most similar first. The
        queried row itself is never included.

    Raises:
        IndexError: If ``product`` is not present in ``df_recommend['ProductName']``.
    """
    # Look the product up by row position, not index label: `similarity` and
    # `.iloc` are positional, and labels only match positions for a default index.
    matches = np.flatnonzero((df_recommend['ProductName'] == product).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Product {product!r} not found in df_recommend['ProductName']")
    product_pos = int(matches[0])

    scores = similarity[product_pos]
    # Exclude the queried row explicitly instead of assuming it sorts first;
    # ties (e.g. identical tags) or an all-zero tag vector break that assumption.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != product_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    names = df_recommend['ProductName']
    return [names.iloc[pos] for pos in ranked[:top_n]]

In [None]:
# Example: products most similar to one named product
recommender("Pecan Shortbread cookies")

# Pipeline that recommends products on the basis of preference

In [None]:
def pipeline(pref):
    """Predict a category for a preference text and print similar products.

    The text is cleaned with ``clean_pref``, vectorized with the fitted
    ``tfidf_predict`` and classified with the fitted ``svc``. The first product
    in ``df_recommend`` whose ``ProductTag`` contains the predicted category is
    used as the seed for ``recommender``.

    Args:
        pref: Raw preference text (a ``str``).

    Returns:
        None. All results are printed.
    """
    # clean the preference and transform it into a vector
    pref_clean = clean_pref(pref)
    df_vec = tfidf_predict.transform([pref_clean]).toarray()

    # predict the category
    pred = svc.predict(df_vec)[0]
    print("Predicted Tag:", pred)

    # category -> probability, sorted from most to least likely
    # NOTE(review): scikit-learn documents that SVC.predict may disagree with the
    # argmax of predict_proba, so the first key here is not guaranteed to be `pred`.
    pred_prob = svc.predict_proba(df_vec)[0]
    category_prob = dict(zip(svc.classes_, pred_prob))
    category_prob_sort = dict(sorted(category_prob.items(), key=lambda item: item[1], reverse=True))
    print("\nProduct tag sorted:", category_prob_sort)

    # fetching all the products whose tag contains the predicted category;
    # non-string tags (e.g. missing values) are treated as non-matching
    tag_matches = df_recommend['ProductTag'].apply(
        lambda product_tag: isinstance(product_tag, str) and pred in product_tag.lower()
    )
    df_product = df_recommend[tag_matches]
    df_product = df_product[['ProductId', 'ProductName', 'Cost', 'Price']]

    # without a matching product there is nothing to seed the recommender with
    if df_product.empty:
        print("\nNo products found for category:", pred)
        return

    product_name = df_product.iloc[0]['ProductName']
    print("\nProduct Name:", product_name)

    # recommend products similar to product name
    print("\nRecommended products:")
    print("--------------------------")
    for recommend in recommender(product_name):
        print(recommend)

# Check with test data

In [None]:
# Load the sample test inputs from CSV and preview them
df_test_sample = pd.read_csv("data/test_input.csv")
df_test_sample.head()

In [None]:
# Keep only the OrderPrice and Preferences columns (rebinds df_test_sample)
df_test_sample = df_test_sample[['OrderPrice', 'Preferences']]
df_test_sample.head()

In [None]:
# Take the preference text at index label 8 as the sample input and display it
test_str = df_test_sample['Preferences'][8]
test_str

In [None]:
# Run the full pipeline on the sample text; it prints the predicted category and the recommended products
pipeline(test_str)