In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', 999)

In [2]:
# watch_csv = pd.read_csv('watch_data/watch_base_latest.csv')
watch_csv = pd.read_csv('watch_data_final/final 2.csv')

In [3]:
#columns we will drop because they are not visually advantageous to classify by
drop_col = [
    'brand',
    'reference',
    'name',
    'produced',
    'limited',
    'caseDiameter',
    'caseHeight',
    'caseLugWidth',
    'dialMaterial',
    'movementBrand',
    'movementCaliber',
    'movementDiameter',
    'movementJewels',
    'movementReserve',
    'movementFrequency',
    'description'
]

watch_csv.drop(drop_col, 
               axis=1,
               inplace=True
              )

In [4]:
watch_csv.drop('imagePath', axis=1, inplace=True)

In [5]:
# watch_csv.head()
watch_csv.movementDisplay.unique()

array(['Analog', 'Digital', 'Analog/ Digital'], dtype=object)

In [6]:
watch_csv['caseMaterial'] = 'case material - ' + watch_csv['caseMaterial']
watch_csv['caseGlass'] = 'case glass - ' + watch_csv['caseGlass']
watch_csv['caseBack'] = 'case back - ' + watch_csv['caseBack']
watch_csv['dialColor'] = 'dial color - ' + watch_csv['dialColor']
watch_csv['dialHands'] = 'dial hands - ' + watch_csv['dialHands']

In [7]:
print("max length\n", len(watch_csv))

#shows the columns with NA values
has_na_vals = []
for column in watch_csv:
    check_na = watch_csv[watch_csv[column].isna()]
    if check_na.empty == False:
        has_na_vals.append([column,len(check_na)])
print("\nColumns with NA values\n", has_na_vals)

max length
 842

Columns with NA values
 [['caseMaterial', 36], ['caseGlass', 18], ['caseBack', 10], ['caseShape', 47], ['dialIndexes', 2], ['dialHands', 6], ['movementTime', 1]]


In [8]:
#we can actually input these missing labels into the dataset if we want.
#however, to speed this up, i am omitting the rows with NA data

for i in has_na_vals:
    with_na = watch_csv[watch_csv[i[0]].isna()]
    watch_csv.drop(with_na.index, axis=0, inplace=True)
#     watch_csv.reset_index(inplace=True, drop=True)

In [9]:
#clean up all special characters for all columns but image link
for column in watch_csv:
    if column != "imageLink":
        watch_csv[column] = watch_csv[column].map(lambda x: re.sub('[^A-Za-z0-9-]+ ', '',x))    
        watch_csv[column] = watch_csv[column].str.slice(0,31,1)

In [10]:
#'update' the new set of rows we know we need to iterate over
new_rows = []

for index, row in watch_csv.iterrows():
    new_rows.append(index)

In [11]:
#update the url link
for i in new_rows:
    splitter = ' gs://ram594/final_images/' + watch_csv.loc[i,"imageLink"].split('/')[-1]
#     splitter = 'gs://watches_draft_2/images_latest/' + watch_csv.loc[i,"imageLink"].split('/')[-1]
    watch_csv.loc[i,'imageLink'] = splitter

In [12]:
link = watch_csv['imageLink']
watch_csv.drop('imageLink', axis=1, inplace=True)
watch_csv.insert(0, 'googleLink', link)

In [13]:
#unable to diagnose issue with duplicate google links pulling same watch, so we will need to 
#cut duplicates out of the dataset
#remove any duplicates of google links

watch_csv.drop_duplicates(subset='googleLink', keep='first', inplace=True)

In [14]:
##### need to loop this maybe 10 times
#we are doing this loop to ensure that each label has at least 10 rows, otherwise autoML will deny

for i in range(0,10):
    items_count = {} #create the dictionary that will hold the columns
    for column in watch_csv:
        if column != 'googleLink':
            items_count[column] = {} #create the second dictionary to hold the count of labels
            for i in watch_csv[column]:
                if i not in items_count[column]:
                    items_count[column][i] = 1 #if label not already in dict, add
                else:
                    items_count[column][i] +=1 #else, add to the num

    #figure out what needs to be removed
    to_remove = []
    for column in items_count:
        for label in items_count[column]:
            if items_count[column][label] < 10:
                to_remove.append([column, label])

    #do the actual removing
    print("DELETE THE FOLLOWING...\n")
    for i in to_remove:
        print("NEW LINE: ",i[0], "LABEL: ", i[1])
        watch_csv = watch_csv[~watch_csv[i[0]].isin([i[1]])]

DELETE THE FOLLOWING...

NEW LINE:  family LABEL:  Engineer Hydrocarbon
NEW LINE:  family LABEL:  Crash de Cartier
NEW LINE:  caseMaterial LABEL:  case material - Yellow gold
NEW LINE:  caseMaterial LABEL:  case material - Ceramic
NEW LINE:  caseMaterial LABEL:  case material - Rose Gold
NEW LINE:  caseMaterial LABEL:  case material - Honey Gold
NEW LINE:  caseGlass LABEL:  case glass - Mineral
NEW LINE:  caseBack LABEL:  case back - Hinged
NEW LINE:  caseBack LABEL:  case back - Hunter
NEW LINE:  caseShape LABEL:  Other
NEW LINE:  dialColor LABEL:  dial color - Red
NEW LINE:  dialColor LABEL:  dial color - Brown
NEW LINE:  dialColor LABEL:  dial color - Cream
NEW LINE:  dialColor LABEL:  dial color - Grey
NEW LINE:  dialColor LABEL:  dial color - Sapphire
NEW LINE:  dialColor LABEL:  dial color - Green
NEW LINE:  dialColor LABEL:  dial color - Beige
NEW LINE:  dialColor LABEL:  dial color - Paved
NEW LINE:  dialColor LABEL:  dial color - Multi-Color
NEW LINE:  dialColor LABEL:  dial c

In [15]:
watch_csv.to_csv(index=False, path_or_buf='watch_data_final/watch_csv.csv')

In [16]:
watch_csv.index

Int64Index([  0,   3,   4,   6,   8,   9,  10,  11,  12,  15,
            ...
            823, 824, 826, 827, 828, 830, 831, 834, 835, 841],
           dtype='int64', length=404)

## FINDING COSINE SIMILARITY FOR RECOMMENDATIONS

In [17]:
#make a new copy df 
cos_df = watch_csv.copy(deep=True)

In [18]:
#we arent going to use google link to find similarity, so drop it
cos_df.drop('googleLink', axis=1, inplace=True)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
#have the watches as their set of labels and then make them a string for comparison
watches = []

cols = cos_df.columns

for row in range(0,len(cos_df.index)): #go through the rows
    watch = ""
    for column in cols:
        watch = watch + cos_df.iloc[row][column] + " "
    watches.append(watch)
    

In [21]:
cos_df.head()

Unnamed: 0,family,caseMaterial,caseGlass,caseBack,caseShape,dialColor,dialIndexes,dialHands,movementType,movementDisplay,movementTime
0,Pilot,case material - Stainless steel,case glass - Sapphire,case back - Open,Round,dial color - Black,Arabic Numerals,dial hands - Sword,Automatic,Analog,HoursMinutesSmall Seconds
3,Instruments,case material - Stainless steel,case glass - Sapphire,case back - Closed,Square,dial color - Black,Mixed,dial hands - Sword,Automatic,Analog,HoursMinutesSmall Seconds
4,Transocean,case material - Stainless Steel,case glass - Sapphire,case back - Closed,Round,dial color - Black,StickDot,dial hands - Stick,Automatic,Analog,HoursMinutesSmall Seconds
6,Transocean,case material - Stainless Steel,case glass - Sapphire,case back - Closed,Round,dial color - Black,StickDot,dial hands - Stick,Automatic,Analog,HoursMinutesSeconds
8,Pilot,case material - Stainless steel,case glass - Sapphire,case back - Open,Round,dial color - Black,Arabic Numerals,dial hands - Sword,Automatic,Analog,HoursMinutesSmall Seconds


In [22]:
#example
watches[0]

'Pilot case material - Stainless steel case glass - Sapphire case back - Open Round dial color - Black Arabic Numerals dial hands - Sword Automatic Analog HoursMinutesSmall Seconds '

In [23]:
#we are using tfidf vectorizer to find the count of words in each set of watch labels
tvec = TfidfVectorizer(stop_words='english')
sparse_matrix = tvec.fit_transform(watches)

In [24]:
#this helps visualize the actual term frequency that we are looking at 
#using tfidf, we are accounting for frequency across all watches
doc_frequency_matrix = sparse_matrix.todense()

df = pd.DataFrame(
    doc_frequency_matrix,
    columns = tvec.get_feature_names(),    
    index = watches
)

In [25]:
#example
df.head(1)

Unnamed: 0,12,analog,arabic,audemars,automatic,baton,black,blue,case,closed,color,dauphine,dial,digital,double,feuille,glass,gold,hands,handwound,hour,hoursminutes,hoursminutesadditional,hoursminutesseconds,hoursminutessecondsworld,hoursminutessmall,instruments,jules,material,mixed,numerals,open,outdoor,pilot,pink,processor,proprietary,quartz,rectangular,red,reverso,round,sapphire,seconds,secondsworld,silver,square,stainless,steel,stick,stickdot,sword,time,transocean,watch,white
Pilot case material - Stainless steel case glass - Sapphire case back - Open Round dial color - Black Arabic Numerals dial hands - Sword Automatic Analog HoursMinutesSmall Seconds,0.0,0.117168,0.261654,0.0,0.147606,0.0,0.19466,0.0,0.335389,0.0,0.111796,0.0,0.223593,0.0,0.0,0.0,0.111796,0.0,0.111796,0.0,0.0,0.0,0.0,0.0,0.0,0.222096,0.0,0.0,0.111796,0.0,0.261654,0.264863,0.0,0.487972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159216,0.111796,0.247785,0.0,0.0,0.0,0.147226,0.147226,0.0,0.0,0.27749,0.0,0.0,0.0,0.0


In [26]:
print(cosine_similarity(df,df))

[[1.         0.51652991 0.6228633  ... 0.41017278 0.40255642 0.68880296]
 [0.51652991 1.         0.60817492 ... 0.40058947 0.39322251 0.5548681 ]
 [0.6228633  0.60817492 1.         ... 0.69100983 0.70414016 0.92207973]
 ...
 [0.41017278 0.40058947 0.69100983 ... 1.         0.53426996 0.63124632]
 [0.40255642 0.39322251 0.70414016 ... 0.53426996 1.         0.63873199]
 [0.68880296 0.5548681  0.92207973 ... 0.63124632 0.63873199 1.        ]]


## WORKING ON ACTUAL TFIDF FROM NEW SAMPLE WHEN IT IS GIVEN

### The code below is an example watch. the code under it is functions to assist

In [27]:
#transforms the labels into tfidf values
#returns a df of the tfidf values for each word
def get_tfidf_of_watch(labels:list):
    test_matrix = tvec.transform(test_watch)

    test_frequency_matrix = test_matrix.todense()

    test_df = pd.DataFrame(
        test_frequency_matrix,
        columns = tvec.get_feature_names(),    
        index = test_watch
    )
    return test_df
#     print(df)

In [28]:
#this gets us the most similar watches by cosine similarity and their indices in 'cos_df' 
def find_indexes_of_similar(cos_sim):
    
    comparison_list = []
    
    for i in range(0, len(cos_sim)):
        comparison_list.append((cos_sim[i], i))
    comparison_list.sort(reverse=True)
    
    return comparison_list
        

In [29]:
#grab the top n watches based on similar labels
def get_top_n_recommendations_indices(comparison_list, n):
    recommended_indices = []
    recommended_watches = []
    i = 0

    while (i < n):
        index = comparison_list[i][1] #get the index of the watch we will refer to
        
        #if we are looking at ourselves as a watch, ignore it and go to another one
        if index == 0:
            n = n + 1
            i = i + 1
            continue
        
        recommended_indices.append(index)
        i = i + 1
    
    original_csv = pd.read_csv('watch_data_final/final 2.csv')

    for i in recommended_indices:
        recommended_watches.append(original_csv.iloc[i]['name'])
        
    return recommended_watches

In [30]:
#an example about how the watch will work when it gets received 
test_watch = [watches[0]]
print("Watch labs:", test_watch,"\n")

#returns a df of the tfidf values for each word
test_tfidf = get_tfidf_of_watch(test_watch)

#get the cosine similarity between the new watch and all of our watches in the database
cos_sim = cosine_similarity(df, test_tfidf)

#this gets us the most similar watches by cosine similarity and their indices in 'cos_df' 
comparison_list = find_indexes_of_similar(cos_sim)

#grab the top 5 watches based on similar labels
recommended = get_top_n_recommendations_indices(comparison_list,5)

#this shows us our recommended
# print("\n\n", recommended)
for i in recommended:
    print("Recommended watch:",i)

Watch labs: ['Pilot case material - Stainless steel case glass - Sapphire case back - Open Round dial color - Black Arabic Numerals dial hands - Sword Automatic Analog HoursMinutesSmall Seconds '] 

Recommended watch: Reverso Lady Quartz
Recommended watch: Reverso Classic Large Duoface Small Seconds Stainless Steel / Silver
Recommended watch: Reverso Classic Large Small Seconds Stainless Steel / Silver / Fagliano
Recommended watch: Transocean Chronograph 1461 Stainless Steel / Black / Calf / Pin
Recommended watch: BR 03 92 Type Aviation Steel
