In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import cv2
from skimage.measure import compare_ssim
import json

### Reading the Raw Data

In [None]:
# Load the raw product export; rows that cannot be parsed are skipped instead of
# aborting the read.
# NOTE(review): `error_bad_lines` is the spelling used by older pandas releases;
# newer releases use `on_bad_lines='skip'` - confirm against the installed version.
data = pd.read_csv(
    'C:\\Users\\vthirumala\\Downloads\\Fishes\\Case Studies\\infilect\\1 Rawdata\\small-2oq-c1r.csv',
    error_bad_lines=False,
)

### 1 Filtering Tunic

In [None]:
def get_tunic(x):
    """Return True when the category string `x` ends with 'Tunics'."""
    suffix = 'Tunics'
    return x.endswith(suffix)
# Keep only the rows whose stringified `categories` value ends with 'Tunics'.
# The boolean mask is applied directly, so no helper column has to be added to
# (and then removed from) either frame.
data_tunics = data[
    data['categories'].astype(str).apply(get_tunic)
]

### 2 Dropping identical/duplicate rows

In [None]:
# One row per productId: the first occurrence is kept, later repeats are removed.
data_tunics = data_tunics.drop_duplicates(subset='productId', keep='first')

### 3 Dropping redundant and unnecessary columns

In [None]:
# Discard the columns that none of the later cells read.
data_tunics = data_tunics.drop(
    columns=[
        'description',
        'categories',
        'sellingPrice',
        'specialPrice',
        'productUrl',
        'productBrand',
        'productFamily',
        'inStock',
        'codAvailable',
        'offers',
        'discount',
        'shippingCharges',
        'keySpecsStr',
        'deliveryTime',
        'sizeUnit',
        'storage',
        'displaySize',
        'specificationList',
        'sellerName',
        'sellerAverageRating',
        'sellerNoOfRatings',
        'sellerNoOfReviews',
        'sleeve',
        'idealFor',
        'neck',
    ]
)

### 4 Creating the 8-Character column

In [None]:
# Grouping key: the first eight characters of each productId.
data_tunics['pId_08'] = data_tunics['productId'].str.slice(stop=8)

### 5 Getting the unique 8-Character Groups 

In [None]:
# Series indexed by 8-character prefix whose values are the number of rows
# carrying that prefix, most frequent prefix first.
pid_first_8_chars_unique = data_tunics['pId_08'].value_counts(sort=True, ascending=False)

### 6 Getting the unique 8-Character Groups where each group has at least 2 productIds

In [None]:
# Prefixes shared by at least two productIds.
# The counts are derived from `data_tunics` here rather than by reading `.index`
# off the previous value of `pid_first_8_chars_unique`: this cell rebinds that
# name to an Index, so the old form raised AttributeError when the cell was run
# a second time. Computing the counts once also removes the duplicate
# `value_counts()` call.
prefix_counts = data_tunics['pId_08'].value_counts()
pid_first_8_chars_unique = prefix_counts[prefix_counts > 1].index

### 7 Extracting image data using imageUrlStr

In [None]:
#  The numpy array returned here is what the similarity score compares between two products.
#  This is the slowest part of the notebook: every call downloads one image from the internet.
def get_img(url):
    """Download the image at `url` and return it as a small RGB numpy array.

    Parameters
    ----------
    url : str
        Address of a single image.

    Returns
    -------
    numpy.ndarray
        The image resized to 20 px wide by 50 px high.

    Raises
    ------
    Exception
        Whatever the download, the image decoder or the resize raises; the
        caller decides whether a failed image is fatal.
    """
    # Imported locally because the notebook's import cell provides no HTTP
    # client: the previous `requests.get` call referenced a module that was
    # never imported, so every call failed with NameError.
    from urllib.request import urlopen

    # The timeout keeps one unresponsive host from stalling the whole run.
    with urlopen(url, timeout=30) as response:
        payload = response.read()

    # Normalise the mode so every array has the same three-channel layout;
    # greyscale, palette or CMYK sources would otherwise produce arrays whose
    # shapes cannot be compared with each other.
    img = Image.open(BytesIO(payload)).convert('RGB')
    img = np.array(img)
    img = cv2.resize(img, (20, 50))
    return img

# Rows whose image cannot be fetched keep the '' placeholder.
data_tunics['image_data'] = ''

# Walk the frame's own index labels. `data_tunics` is a filtered, de-duplicated
# subset, so its labels are not 0..len-1; looking rows up by position-as-label
# raised KeyError for most rows and silently left them without image data.
for row_label in data_tunics.index:
    try:
        # Only the first ';'-separated URL of each product is downloaded.
        first_url = data_tunics.at[row_label, 'imageUrlStr'].split(';')[0]
        data_tunics.at[row_label, 'image_data'] = get_img(first_url)
    except Exception:
        # Best effort: a missing URL or a failed download skips this row only.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the long loop.
        continue

### 8 Creating function for calculating similarity score

In [None]:
def _first_image_url(row):
    """Return the first ';'-separated URL in row['imageUrlStr'], or None if there is none."""
    urls = row['imageUrlStr']
    if not isinstance(urls, str):
        # Non-string values (e.g. NaN for a missing cell) carry no URL.
        return None
    first = urls.split(';')[0].strip()
    return first or None


def calculate_score(row1, row2):
    """Return a similarity score for two product rows.

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'imageUrlStr', 'image_data', 'mrp', 'title'
        and 'detailedSpecsStr'.

    Returns
    -------
    numpy.float64
        Mean of four component scores (image, mrp, title, detailedSpecsStr),
        rounded to 6 decimal places.
    """
    # Image similarity: structural similarity of the two downloaded images.
    # NOTE(review): `compare_ssim` / `multichannel` are the names used by older
    # scikit-image releases - confirm against the installed version.
    try:
        image_score = compare_ssim(row1['image_data'], row2['image_data'], multichannel=True)
    except Exception:
        # The image data could not be compared (e.g. a download failed).
        # Identical first image URLs still identify the same picture, so they
        # score 1; otherwise the neutral value 0.5 is used.
        # The earlier URL comparison used `.strip(';')[0]` (first character of
        # the string), failed on empty or missing URLs, and was always
        # overwritten, so it never influenced the result.
        url1 = _first_image_url(row1)
        same_url = url1 is not None and url1 == _first_image_url(row2)
        image_score = 1.0 if same_url else 0.5

    # Gives similarity score of mrp as 0 or 1
    mrp_score = float(row1['mrp'] == row2['mrp'])

    # Gives similarity score of title as 0 or 1
    title_score = float(row1['title'] == row2['title'])

    # Gives similarity score of detailedSpecsStr as 0 or 1
    detailedSpecStr_score = float(row1['detailedSpecsStr'] == row2['detailedSpecsStr'])

    # Return mean of all the individual scores, rounded to 6 decimal places
    return np.mean([image_score, mrp_score, title_score, detailedSpecStr_score]).round(6)

### 9 Iterative Comparison

In [None]:
# Maps a productId to its likely duplicates inside the same 8-character group:
# a list of [productId, score] pairs, best match first. Only products compared
# later in the group are listed, so the last product of a group gets no key.
master_dictionary = {}

for pid in pid_first_8_chars_unique:

    # Rows sharing this prefix. Kept in `group` rather than `data` so the raw
    # frame loaded at the top of the notebook is not overwritten, which would
    # break re-running the earlier cells.
    group = data_tunics[data_tunics['pId_08'] == pid]

    # Read the ids by column name rather than by position 0, so the result does
    # not depend on the column order of the input file.
    product_ids = group['productId'].tolist()

    for i in range(len(group) - 1):

        # Flat list [pid, score, pid, score, ...] for every later row j > i.
        similar_pids = []
        for j in range(i + 1, len(group)):
            score = calculate_score(group.iloc[i, :], group.iloc[j, :])
            similar_pids.append(product_ids[j])
            similar_pids.append(score)

        # Two columns: productId, score. Mixing ids and numbers in one numpy
        # array stores both as strings, hence the explicit float conversion.
        # NOTE(review): the scores therefore reach the JSON output as strings;
        # kept as-is so the output format does not change.
        similar_pids = np.array(similar_pids).reshape(-1, 2)
        scores = similar_pids[:, 1].astype(float)

        # Keep only candidates scoring above 0.5.
        keep = scores > 0.5
        b = similar_pids[keep, :]

        # Sort by descending score. The previous expression negated the argsort
        # result instead of the scores (unary minus binds after the method
        # call), which reordered rows incorrectly for three or more matches.
        # An empty selection simply yields [].
        order = np.argsort(-scores[keep], kind='stable')
        master_dictionary[product_ids[i]] = b[order].tolist()

### 10 Saving result in JSON format

In [None]:
filename = 'C:\\Users\\vthirumala\\Downloads\\Fishes\\Case Studies\\infilect\\result.json'
# The `with` block closes (and therefore flushes) the file even if serialisation
# fails; the previous inline `open()` never closed its handle explicitly.
with open(filename, 'w') as result_file:
    json.dump(master_dictionary, result_file, indent=3)