# Big Data Mining, 1st Assignment solution
## Vangelis Christou, p2821805



In [1]:
#read
import pandas as pd 
df = pd.read_csv("groceries.csv",sep=";") 

#preview data
df.head()


Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries
0,1,75,male,married,primary,20000,very_good,3,retired,"citrus fruit,semi-finished bread,margarine,rea..."
1,2,61,female,single,secondary,28000,good,1,housemaid,"tropical fruit,yogurt,coffee"
2,3,32,male,single,secondary,34000,very_good,1,blue-collar,whole milk
3,4,62,male,married,primary,31000,very_good,3,blue-collar,"pip fruit,yogurt,cream cheese,meat spreads"
4,5,66,female,married,secondary,19000,good,3,retired,"other vegetables,whole milk,condensed milk,lon..."


In [2]:
#view default  data types
df.dtypes

Customer_ID              int64
Age                     object
Sex                     object
Marital_Status          object
Education               object
Income                  object
Customer_Rating         object
Persons_in_Household     int64
Occupation              object
Groceries               object
dtype: object

In [3]:
import numpy as np
#replace ' ' values with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

#set proper data types
#numeric
df["Age"] = pd.to_numeric(df["Age"],downcast='integer')
df["Income"] = pd.to_numeric(df["Income"],downcast='integer')
df["Persons_in_Household"] = pd.to_numeric(df["Persons_in_Household"])

#nominal
df["Sex"] = df["Sex"].astype('category')
df["Marital_Status"] = df["Marital_Status"].astype('category')
df["Occupation"] = df["Occupation"].astype('category')

#ordinal
#for the ordinal columns we will replace with an enumeration to calculate similarity as numeric

cRatings = df["Customer_Rating"] = pd.Categorical(df["Customer_Rating"], categories=["poor", "fair", "good", "very_good", "excellent"], ordered=True)

cRatings = cRatings.replace("poor",1).replace("fair",2).replace("very_good",4).replace("good",3).replace("excellent",5)
df["Customer_Rating_enum"] = cRatings.astype(np.int)


cEducation = df["Education"] = pd.Categorical(df["Education"], categories=["primary", "secondary", "tertiary"], ordered=True)
cEducation = cEducation.replace("primary",0).replace("secondary",1).replace("tertiary",2)
df["Education_enum"] = cEducation.astype(np.int)



#fill Nan values with mean
df.fillna(df.mean().astype(int), inplace=True)
df["Age"] = df["Age"].astype(np.int) 
df["Income"] = df["Income"].astype(np.int) 

# convert Groceries to set of values
df.Groceries= df.Groceries.str.split(",") 

df.dtypes





Customer_ID                int64
Age                        int32
Sex                     category
Marital_Status          category
Education               category
Income                     int32
Customer_Rating         category
Persons_in_Household       int64
Occupation              category
Groceries                 object
Customer_Rating_enum       int32
Education_enum             int32
dtype: object

In [4]:

df.head(15)

Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries,Customer_Rating_enum,Education_enum
0,1,75,male,married,primary,20000,very_good,3,retired,"[citrus fruit, semi-finished bread, margarine,...",4,0
1,2,61,female,single,secondary,28000,good,1,housemaid,"[tropical fruit, yogurt, coffee]",3,1
2,3,32,male,single,secondary,34000,very_good,1,blue-collar,[whole milk],4,1
3,4,62,male,married,primary,31000,very_good,3,blue-collar,"[pip fruit, yogurt, cream cheese, meat spreads]",4,0
4,5,66,female,married,secondary,19000,good,3,retired,"[other vegetables, whole milk, condensed milk,...",3,1
5,6,55,female,single,secondary,35000,very_good,1,unemployed,"[whole milk, butter, yogurt, rice, abrasive cl...",4,1
6,7,23,female,married,tertiary,21000,good,3,housemaid,[rolls/buns],3,2
7,8,26,female,single,secondary,30000,good,2,blue-collar,"[other vegetables, UHT-milk, rolls/buns, bottl...",3,1
8,9,29,female,married,secondary,32000,very_good,3,blue-collar,[potted plants],4,1
9,10,57,female,married,secondary,26000,good,3,entrepreneur,"[whole milk, cereals]",3,1


In [5]:
import itertools
import time

def jaccard_sim(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return round(float(intersection) / union, 2)

def numeric_sim(num1, num2, maxvalue,minvalue):
    return 1- abs(num1-num2) / (maxvalue - minvalue)

def nominal_sim(nom1,nom2):
    return int(nom1 == nom2)


   

def pairwiseJaccard(cust, appendTo):
    sim={}
    print("Computing Jaccard similarities")
    start_time = time.time()
    for pair in itertools.combinations(cust, r=2):
        s=jaccard_sim(cust[pair[0]],cust[pair[1]])
        appendTo[tuple([pair[0],pair[1]])] = [s]

    end_time = time.time()
    print("Jaccard Success after: " ,end_time-start_time)
    return appendTo

def pairwiseNominal(cust, appendTo):
    sim={}
    print("Computing Nominal similarities")
    start_time = time.time()
    for pair in itertools.combinations(cust, r=2):
        s=nominal_sim(cust[pair[0]],cust[pair[1]])
        dict(appendTo)[tuple([pair[0],pair[1]])].append(s)



    end_time = time.time()
    print("Nominal Success after: " ,end_time-start_time)
    return appendTo

def pairwiseNumeric(cust, maxvalue,minvalue):
    sim={}
    print("Computing numeric similarities")
    start_time = time.time()
    for pair in itertools.combinations(cust, r=2):
        s=numeric_sim(cust[pair[0]],cust[pair[1]],maxvalue,minvalue)
        #print(cust[pair[0]],cust[pair[1]],maxvalue,minvalue,"=", s,"\n")
        sim[tuple([pair[0],pair[1]])]=s
    
    end_time = time.time()
    print("Numeric Success after: " ,end_time-start_time)
    return sim


In [6]:
#numeric
age_d = dict(zip(df.Customer_ID, df.Age))
persons_d = dict(zip(df.Customer_ID, df.Persons_in_Household))
income_d = dict(zip(df.Customer_ID, df.Income))

#nominal
sex_d = dict(zip(df.Customer_ID, df.Sex))
occupation_d = dict(zip(df.Customer_ID, df.Occupation))
marital_d = dict(zip(df.Customer_ID, df.Marital_Status))

#ordinal
customer_rating_d = dict(zip(df.Customer_ID, df.Customer_Rating_enum))
education_enum_d = dict(zip(df.Customer_ID, df.Education_enum))

#set
groceries_d = dict(zip(df.Customer_ID, df.Groceries))


#helper test dictionaries
#short_groceries_d = dict(list(groceries_d.items())[0:1000])
#short_sex_d = dict(list(sex_d.items())[0:1000])
#short_cust_rating = dict(list(customer_rating_d.items())[0:1000])
#numeric
dsize = 1000

age_d = dict(list(age_d.items())[0:dsize])
persons_d = dict(list(persons_d.items())[0:dsize])
income_d = dict(list(income_d.items())[0:dsize])

sex_d = dict(list(sex_d.items())[0:dsize])
occupation_d = dict(list(occupation_d.items())[0:dsize])
marital_d = dict(list(marital_d.items())[0:dsize])

customer_rating_d = dict(list(customer_rating_d.items())[0:dsize])
education_enum_d = dict(list(education_enum_d.items())[0:dsize])
groceries_d = dict(list(groceries_d.items())[0:dsize])




In [18]:
###Calculate Similarities

#numeric
#simAge = pairwiseNumeric(age_d, df.Age.max(),df.Age.min())
#simHouseholdPersons = pairwiseNumeric(persons_d,df.Persons_in_Household.max(),df.Persons_in_Household.min())
#simIncome = pairwiseNumeric(income_d,df.Income.max(),df.Income.min())

#nominal
#simSex = pairwiseNominal(sex_d)
#simOccupation = pairwiseNominal(occupation_d)
#simMaritalStatus = pairwiseNominal(marital_d)


#ordinal
#simCustomerRating = pairwiseNumeric(customer_rating_d,df.Customer_Rating_enum.max(), df.Customer_Rating_enum.min())
#simEducation = pairwiseNumeric(education_enum_d,df.Education_enum.max(), df.Education_enum.min())

#set
from collections import defaultdict
dictSim = defaultdict(list)
simGroceries = pairwiseJaccard(groceries_d, dictSim)
simMaritalStatus = pairwiseNominal(marital_d, dictSim)

dictSim




Computing Jaccard similarities
Jaccard Success after:  1.0868122577667236
Computing Nominal similarities


KeyboardInterrupt: 

In [8]:
import operator

def GetSimilarCustomers(customerID,head,sim):
    filtered_dict = {k:v for (k,v) in sim.items() if customerID in k}
    sorted_d = dict(sorted(filtered_dict.items(), key=operator.itemgetter(1),reverse=True))
    return dict(list(sorted_d.items())[0:head])


def GetDictionaryWithAvgSimilarity(listSim):
    d0=listSim[0]
    i=1;
    while i < len(listSim) :
        for k, value in d0.items():
            d0[k] = float((d0[k] + listSim[i][k]) / 2)

        i+=1
    return d0



In [9]:


#SimilarityList = [simGroceries, simSex]

SimilarityList = [simAge,simHouseholdPersons,simIncome, simMaritalStatus, simGroceries, simSex,simOccupation, simCustomerRating, simEducation]



avgSim =  GetDictionaryWithAvgSimilarity(SimilarityList)

customerids = [73,563]
for i in customerids: 
    print(GetSimilarCustomers(i,10,avgSim))
    print("----\n")


#73, 563, 1603, 2200, 3703, 4263, 5300, 6129, 7800, 8555

  
# printing final result 





{(66, 73): 0.9555250789322618, (73, 347): 0.9553941231343284, (73, 143): 0.9552517580367395, (73, 468): 0.9535323981056257, (73, 872): 0.9534145378874856, (73, 797): 0.953256565729047, (73, 529): 0.9519230769230769, (73, 638): 0.9517291547072331, (73, 328): 0.9512799583811711, (73, 480): 0.9510330080367394}
----

