In [1]:
import time
import warnings
warnings.filterwarnings('ignore')

# For loading the dataframe
import pickle

# For loading data and data manipulation
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# For cleaning the text
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# For converting them into vectors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# To find the similarity between products
from sklearn.metrics.pairwise import cosine_similarity

# For model building libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Loading the datasets

In [2]:
# Load the prepared prediction dataframe. The context manager closes the file
# handle as soon as the object has been read.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("eda_data/predict.pkl", "rb") as predict_file:
    df_predict = pickle.load(predict_file)
df_predict.head()

Unnamed: 0,Preferences,Category,Category_Num
0,I’m more of a tea person 🍵$$$I prefer herbal/d...,decaf,16
1,I’m more of a tea person 🍵$$$I prefer herbal/d...,sweet,3
2,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",hard liquor,7
3,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",accessory,2
4,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",LiquorAccessory,1


# Clean the preference

In [3]:
# Shared text-cleaning helpers: the tokenizer keeps only runs of word
# characters (punctuation and emoji are dropped), and the lemmatizer is
# applied to each remaining token.
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"\w+")

In [4]:
# Function to clean the Preference column

# Build the English stop-word collection once. Calling stopwords.words('english')
# inside the comprehension rebuilt the whole list for every token and searched it
# linearly; a frozenset is created a single time and gives constant-time lookups.
STOP_WORDS_EN = frozenset(stopwords.words('english'))


def clean_pref(row):
    """Normalise one preference string for vectorisation.

    The text is lower-cased, split into word tokens with the module-level
    ``tokenizer``, stripped of English stop words and lemmatized with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    row : str
        Raw preference text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # lower-case first so the comparison with the lower-case stop words works
    words = tokenizer.tokenize(row.lower())
    # tokens produced by the r"\w+" tokenizer contain no whitespace, so they
    # can be lemmatized directly without stripping
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in STOP_WORDS_EN
    )

In [5]:
%%time
# Applying clean preference function to transform the text
df_predict['Preference_Clean'] = df_predict['Preferences'].apply(clean_pref)

CPU times: user 8.34 s, sys: 1.44 s, total: 9.78 s
Wall time: 9.78 s


In [6]:
# Preview the frame again to check the new Preference_Clean column
df_predict.head()

Unnamed: 0,Preferences,Category,Category_Num,Preference_Clean
0,I’m more of a tea person 🍵$$$I prefer herbal/d...,decaf,16,tea person prefer herbal de caffeinated tea ch...
1,I’m more of a tea person 🍵$$$I prefer herbal/d...,sweet,3,tea person prefer herbal de caffeinated tea ch...
2,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",hard liquor,7,drink coffee tea chocolate candy baked treat s...
3,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",accessory,2,drink coffee tea chocolate candy baked treat s...
4,"I don’t drink coffee or tea 🤭$$$ Chocolates, C...",LiquorAccessory,1,drink coffee tea chocolate candy baked treat s...


# Split the data into train and test

In [7]:
# Model input is the cleaned preference text; the target is the category label
X, y = df_predict['Preference_Clean'], df_predict['Category']

X.shape, y.shape

((3338,), (3338,))

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2670,), (668,), (2670,), (668,))

# Converting features into vectors

In [9]:
# TF-IDF vectoriser for the cleaned preference text: the vocabulary is capped at
# 3000 terms and scikit-learn's built-in English stop-word list is applied
tfidf_predict = TfidfVectorizer(max_features=3000, stop_words='english')

In [10]:
# Fit the vectoriser on the training text only, then reuse the fitted
# vectoriser for the test text; both results are converted to dense arrays
X_train_vec = tfidf_predict.fit_transform(X_train).toarray()
X_test_vec = tfidf_predict.transform(X_test).toarray()

In [11]:
# (number of training rows, number of TF-IDF features)
X_train_vec.shape

(2670, 569)

# Creating a model

In [12]:
# Candidate classifiers to compare on the TF-IDF features.
# Every estimator that draws random numbers while fitting gets a fixed seed so
# that Restart & Run All reproduces the same accuracy table and the same `lrc`.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=97)
bnb = BernoulliNB()
mnb = MultinomialNB()
# a decision tree permutes candidate features at each split, so it is seeded too
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# the liblinear solver shuffles the data internally; seed it for repeatable fits
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)

In [13]:
%%time
model_perf = []
classifiers = [svc, knc, bnb, mnb, dtc, lrc, rfc, abc, bc, etc, gbdt]
for classifier in classifiers:
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    model_perf.append({'Model': classifier, 'Accuracy': accuracy})

CPU times: user 46.1 s, sys: 1.54 s, total: 47.7 s
Wall time: 44.2 s


### Model performance (Logistic Regression works the best)

In [14]:
# One row per fitted classifier with its accuracy on the held-out test split
pd.DataFrame(model_perf)

Unnamed: 0,Model,Accuracy
0,"SVC(gamma=1.0, kernel='sigmoid')",0.184132
1,KNeighborsClassifier(n_neighbors=97),0.154192
2,BernoulliNB(),0.157186
3,MultinomialNB(),0.14521
4,DecisionTreeClassifier(max_depth=5),0.146707
5,"LogisticRegression(penalty='l1', solver='libli...",0.194611
6,"(DecisionTreeClassifier(max_features='auto', r...",0.007485
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.163174
8,(DecisionTreeClassifier(random_state=114693939...,0.010479
9,"(ExtraTreeClassifier(random_state=1872583848),...",0.008982


# Creating recommendation model

In [15]:
# Load the prepared product catalogue. The context manager closes the file
# handle as soon as the object has been read.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("eda_data/recommendation.pkl", "rb") as recommend_file:
    df_recommend = pickle.load(recommend_file)
df_recommend.head()

Unnamed: 0,ProductId,ProductName,ProductTag,Rating,Cost,Price,Tag
0,245,Guatemalan Signature Roast Ground Coffee,"coffee, ground",4.4427,13.78,25.99,guatemalan signature roast ground coffee coffe...
1,278,Los Gigantes Colombia (Coffee Beans),"coffee, artisanal",4.3185,10.75,20.0,los gigantes colombia coffee bean coffee artis...
2,351,Rowkin Bit Charge Stereo (Silver),music,4.0,64.0,102.99,rowkin bit charge stereo silver music
3,424,Chocolate Fruit and Nut Dragee Sampler,sweet,4.4643,21.0,35.0,chocolate fruit nut dragee sampler sweet
4,700,Pro Lens Kit for Phone Camera,photography,4.3588,24.5,42.99,pro lens kit phone camera photography


### Converting tags column into vectors

In [16]:
# Separate TF-IDF vectoriser for the product tag text (vocabulary capped at
# 5000 terms, scikit-learn's built-in English stop-word list applied)
tfidf_recommend = TfidfVectorizer(max_features=5000, stop_words='english')

In [17]:
# Fit on the Tag column and convert to a dense array: one row per product
recommend_vec = tfidf_recommend.fit_transform(df_recommend['Tag']).toarray()
recommend_vec

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
# (number of products, number of TF-IDF features)
recommend_vec.shape

(341, 781)

### Calculate the cosine similarity between the tag vectors; each matrix value is the similarity score (0 = nothing in common, 1 = identical) between two products' tags

In [19]:
# Pairwise cosine similarity between all product tag vectors;
# entry [i, j] compares the product in row i with the product in row j
similarity = cosine_similarity(recommend_vec)
similarity

array([[1.        , 0.25549427, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25549427, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.87744968],
       [0.        , 0.        , 0.        , ..., 0.        , 0.87744968,
        1.        ]])

In [20]:
# Square matrix: one row and one column per product
similarity.shape

(341, 341)

### Recommender function that returns the products most similar to the given one

In [21]:
def recommender(product, top_n=5, catalog=None, sim_matrix=None):
    """Return the names of the products most similar to ``product``.

    Parameters
    ----------
    product : str
        Exact ``ProductName`` of the reference product.
    top_n : int, default 5
        Maximum number of recommendations to return.
    catalog : pandas.DataFrame, optional
        Frame with a ``ProductName`` column. Defaults to the module-level
        ``df_recommend``.
    sim_matrix : array-like, optional
        Square similarity matrix whose rows and columns follow the row order
        of ``catalog``. Defaults to the module-level ``similarity``.

    Returns
    -------
    list of str
        Product names ordered from most to least similar. The reference
        product itself is never included.

    Raises
    ------
    IndexError
        If no row of ``catalog`` has ``ProductName == product``.
    """
    if catalog is None:
        catalog = df_recommend
    if sim_matrix is None:
        sim_matrix = similarity

    names = catalog['ProductName']

    # Use the row *position* rather than the index label: the similarity
    # matrix is positional, so a filtered or re-indexed frame would otherwise
    # select the wrong row.
    positions = np.flatnonzero((names == product).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Product not found in catalogue: {product!r}")
    product_pos = int(positions[0])

    scores = np.asarray(sim_matrix[product_pos])
    # Stable descending order: ties keep their catalogue order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the reference product explicitly instead of assuming it sorts first;
    # another product with an identical tag vector also scores 1.0.
    return [names.iloc[pos] for pos in ranked if pos != product_pos][:top_n]

In [22]:
# Example: products most similar to one catalogue item
recommender("Pecan Shortbread cookies")

['Vanilla Bean Shortbread Cookies',
 'The Luxe Collection Cocoa Shortbread',
 'Chocolate Cacao Nib Shortbread',
 'Original Speculoos Cookies- 4.8oz',
 'The Luxe Collection Almond Butter Cookies']

# Pipeline that recommends products on the basis of preference

In [23]:
def pipeline(pref):
    """Recommend catalogue products for one free-text preference.

    The text is cleaned with ``clean_pref``, vectorised with the fitted
    ``tfidf_predict`` and scored with the fitted ``lrc`` classifier. The two
    most probable categories are printed, and every row of ``df_recommend``
    whose ``ProductTag`` text contains the most probable category is selected.
    Only the top category is used for filtering; the second is informational.

    Parameters
    ----------
    pref : str
        Raw preference text.

    Returns
    -------
    pandas.DataFrame
        The matching products with columns ``ProductId``, ``ProductName``,
        ``Cost`` and ``Price``. Empty when no tag contains the category.
    """
    # clean the preference with the same routine used on the training text
    pref_clean = clean_pref(pref)
    print("Preference after clean:", [pref_clean])

    # transform the cleaned preference into a TF-IDF vector
    pref_vec = tfidf_predict.transform([pref_clean]).toarray()

    # probability of every category, keyed by its class label
    pred_prob = lrc.predict_proba(pref_vec)[0]
    category_prob = dict(zip(lrc.classes_, pred_prob))

    # the two most probable categories, best first
    pred_two = sorted(category_prob, key=category_prob.get, reverse=True)[:2]
    print("\nProbability tag:", pred_two)

    # fetching all the products whose tag text contains the top category;
    # na=False treats a missing ProductTag as "no match" instead of raising
    # NOTE(review): this is a substring test, so a short category can also
    # match inside a longer tag - confirm that is intended.
    tag_matches = df_recommend['ProductTag'].str.contains(
        str(pred_two[0]), regex=False, na=False
    )
    df_product = df_recommend.loc[
        tag_matches, ['ProductId', 'ProductName', 'Cost', 'Price']
    ]

    print("\n", df_product['ProductName'])
    # return the selection so callers can use it, not only read the printout
    return df_product

# Check with test data

In [24]:
# Load the sample test orders and preview the first rows
df_test_sample = pd.read_csv("data/test_input.csv")
df_test_sample.head()

Unnamed: 0.1,Unnamed: 0,OrderID,OrderPrice,Preferences
0,1001,171888537,40,I'm a coffee person ☕️$$$I love to make pour o...
1,1002,171888536,40,I'm a coffee person ☕️$$$I love to make pour o...
2,1003,171888535,40,"I don’t drink coffee or tea 🤭$$$ Chocolates, C..."
3,1004,171888533,30,I'm a coffee person ☕️$$$I love to make pour o...
4,1005,171888532,50,I’m more of a tea person 🍵$$$Any specialty tea...


In [25]:
# Keep only the order price and the preference text columns
df_test_sample = df_test_sample[['OrderPrice', 'Preferences']]
df_test_sample.head()

Unnamed: 0,OrderPrice,Preferences
0,40,I'm a coffee person ☕️$$$I love to make pour o...
1,40,I'm a coffee person ☕️$$$I love to make pour o...
2,40,"I don’t drink coffee or tea 🤭$$$ Chocolates, C..."
3,30,I'm a coffee person ☕️$$$I love to make pour o...
4,50,I’m more of a tea person 🍵$$$Any specialty tea...


In [26]:
# Run the pipeline on the preference text of the row with index label 8
pipeline(df_test_sample['Preferences'][8])

Preference after clean: ['tea person specialty tea good savory non sweet snack allergy dietary restriction send like everything enjoy capturing picture amp video smart home gadget cool love send oh yeah l']

Probability tag: ['tea', 'LiquorAccessory']

 35                Mini Signature Box (Five Pyramid Bags)
62                           "Home Sweet Home" Tea Towel
73     Petite Presentation Box Warming Joy (Pyramid B...
88                         Classic Wooden Assortment Box
89                                Citrus Ginger Cylinder
110                                       Sparkling Rose
111                                        Birthday Cake
112                          Unicorn Tears (unsweetened)
194                     Insulated Bottle with Wooden Cap
235                         Relax Tea Pyramid Bags (TIN)
236                         Sleep Tea Pyramid Bags (TIN)
237                      Immunity Tea Pyramid Bags (TIN)
316                 Chamomile - Caffeine Free Teabag Tin
317  