In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import f1_score as f1

In [2]:
# Load the cleaned transactions, discard the 'Unnamed: 0' column (presumably a
# saved row index -- confirm), and order the rows by TRANSACTION_DATE so that
# positional slices of `data` follow that ordering.
data = (
    pd.read_csv('cleaned_data.csv')
    .drop('Unnamed: 0', axis='columns')
    .sort_values(by=['TRANSACTION_DATE'])
)

# Train/Test Split

In [3]:
# Positional 80/20 split of the date-sorted rows: the first 80% form the
# training set and the remainder the test set. The cutoff is computed once so
# the two slices cannot drift apart.
# NOTE(review): the cut is by row position, not by date, so rows sharing the
# boundary date can land on both sides (the printed date ranges show
# 2018/09/10 in both splits).
split_at = math.floor(.8*len(data))
train = data.iloc[:split_at]
test = data.iloc[split_at:]

In [4]:
# Show the three smallest and three largest distinct TRANSACTION_DATE values in
# the training split; the sorted list is built once and sliced twice.
train_dates = sorted(train.TRANSACTION_DATE.unique())
print(train_dates[:3], train_dates[-3:])

['2009/01/02 05:00:00+00', '2009/01/03 05:00:00+00', '2009/01/04 05:00:00+00'] ['2018/09/08 04:00:00+00', '2018/09/09 04:00:00+00', '2018/09/10 04:00:00+00']


In [5]:
# Show the three smallest and three largest distinct TRANSACTION_DATE values in
# the test split; the sorted list is built once and sliced twice.
test_dates = sorted(test.TRANSACTION_DATE.unique())
print(test_dates[:3], test_dates[-3:])

['2018/09/10 04:00:00+00', '2018/09/11 04:00:00+00', '2018/09/12 04:00:00+00'] ['2020/10/28 04:00:00+00', '2020/10/29 04:00:00+00', '2020/10/30 04:00:00+00']


# Convert to Matrix

In [6]:
# Map each distinct agency / vendor name to its position in the sorted list of
# names (0-based).
agencies = sorted(data.AGENCY.unique())
vendors = sorted(data.VENDOR_NAME.unique())
agency_ids = {name: position for position, name in enumerate(agencies)}
vendor_ids = {name: position for position, name in enumerate(vendors)}

In [7]:
# Keep only the three columns used from here on, in both splits.
train, test = (
    split[["AGENCY_ID", "TRANSACTION_AMOUNT", "VENDOR_ID"]]
    for split in (train, test)
)

In [8]:
def convert_to_matrix(df, rows, cols):
    """Scatter a frame's TRANSACTION_AMOUNT column into a dense 2-D list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TRANSACTION_AMOUNT`` column whose index entries are
        ``(row, col)`` pairs of integer positions (e.g. the result of a
        two-key ``groupby``). Each pair is expected to appear once; if a pair
        repeats, the last occurrence wins.
    rows, cols : sized collections
        Only their lengths are used: ``len(rows)`` is the number of matrix
        rows and ``len(cols)`` the number of columns.

    Returns
    -------
    list of list
        ``len(rows)`` lists of ``len(cols)`` entries. Positions that do not
        occur in ``df.index`` stay 0.

    Raises
    ------
    IndexError
        If an index position is >= the corresponding matrix dimension.
    """
    n_rows, n_cols = len(rows), len(cols)
    matrix = [[0] * n_cols for _ in range(n_rows)]
    # Walk the index and the column values together in one pass; a separate
    # `df.loc[idx]` lookup per entry is far slower and yields the same values.
    for idx, amount in zip(df.index, df["TRANSACTION_AMOUNT"].to_numpy()):
        matrix[idx[0]][idx[1]] = amount
    return matrix

In [9]:
# Count the non-null TRANSACTION_AMOUNT entries per (AGENCY_ID, VENDOR_ID) pair
# in the training split, then lay them out as a dense table with one row per
# entry of agency_ids and one column per entry of vendor_ids.
counts = train.groupby(by=['AGENCY_ID', 'VENDOR_ID']).count()
train_counts = pd.DataFrame(
    convert_to_matrix(counts, rows=agency_ids.values(), cols=vendor_ids.values())
)

In [10]:
# Same per-(AGENCY_ID, VENDOR_ID) count table as for the training split, built
# from the test split. Note that `counts` is rebound here.
counts = test.groupby(by=['AGENCY_ID', 'VENDOR_ID']).count()
test_counts = pd.DataFrame(
    convert_to_matrix(counts, rows=agency_ids.values(), cols=vendor_ids.values())
)

# Test Recommendations

In [11]:
# Pairwise cosine similarity between the rows of train_counts (square matrix,
# one row/column per row of train_counts).
cos_counts = cosine_similarity(train_counts)

In [12]:
# Render the similarity matrix as a table for a quick visual check.
pd.DataFrame(cos_counts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
0,1.000000,0.068136,0.080668,0.526835,0.151007,0.526983,0.569407,0.020808,0.105257,0.534175,...,0.774991,0.502050,0.011826,0.672543,0.010237,0.640565,0.176044,0.442245,0.225285,0.319560
1,0.068136,1.000000,0.158139,0.294187,0.372166,0.510850,0.064115,0.474408,0.055798,0.120816,...,0.169362,0.222992,0.050724,0.074396,0.072682,0.127038,0.143595,0.381923,0.139603,0.264789
2,0.080668,0.158139,1.000000,0.125748,0.179328,0.204949,0.046233,0.246575,0.152228,0.102740,...,0.159157,0.273899,0.058661,0.189571,0.105481,0.100539,0.070347,0.162186,0.207033,0.233579
3,0.526835,0.294187,0.125748,1.000000,0.261977,0.656689,0.422708,0.374770,0.256287,0.541998,...,0.596516,0.416764,0.043146,0.464330,0.029044,0.703732,0.148270,0.492929,0.199174,0.323473
4,0.151007,0.372166,0.179328,0.261977,1.000000,0.385737,0.114777,0.293952,0.178592,0.225322,...,0.237978,0.362104,0.058102,0.208130,0.109644,0.171217,0.245557,0.306668,0.241813,0.304033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.640565,0.127038,0.100539,0.703732,0.171217,0.478623,0.495655,0.131515,0.208365,0.556791,...,0.678639,0.507929,0.040272,0.597604,0.029198,1.000000,0.203210,0.379443,0.240190,0.303012
85,0.176044,0.143595,0.070347,0.148270,0.245557,0.195494,0.131094,0.030165,0.125298,0.180897,...,0.203593,0.260311,0.047700,0.178449,0.032980,0.203210,1.000000,0.134023,0.110448,0.242836
86,0.442245,0.381923,0.162186,0.492929,0.306668,0.652391,0.337213,0.381603,0.040144,0.343984,...,0.580623,0.443149,0.016636,0.422888,0.056162,0.379443,0.134023,1.000000,0.268236,0.389996
87,0.225285,0.139603,0.207033,0.199174,0.241813,0.249273,0.215142,0.175016,0.189289,0.187817,...,0.317581,0.502760,0.063515,0.356430,0.101448,0.240190,0.110448,0.268236,1.000000,0.309487


In [13]:
def make_recommendations(transactions, similarity, kind='user'):
    """Score every (user, item) cell as a similarity-weighted average.

    Parameters
    ----------
    transactions : array-like, shape (n_users, n_items)
        Interaction matrix; must support ``.dot``.
    similarity : array-like
        Square similarity matrix: (n_users, n_users) for ``kind='user'``,
        (n_items, n_items) for ``kind='item'``.
    kind : {'user', 'item'}, default 'user'
        ``'user'`` computes ``similarity . transactions`` and divides each row
        by that user's summed absolute similarities; ``'item'`` computes
        ``transactions . similarity`` and divides each column by that item's
        summed absolute similarities.

    Returns
    -------
    Array of shape (n_users, n_items). There is no guard against a zero
    denominator: a similarity row whose absolute values sum to 0 produces
    non-finite entries.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Normaliser per similarity row: sum of absolute similarities.
    weights = np.asarray(np.abs(similarity).sum(axis=1))
    if kind == 'user':
        # Column vector so each output row is divided by its own weight.
        return similarity.dot(transactions) / weights.reshape(-1, 1)
    if kind == 'item':
        # Row vector so each output column is divided by its own weight.
        return transactions.dot(similarity) / weights.reshape(1, -1)
    raise ValueError(f"kind must be 'user' or 'item', got {kind!r}")

In [14]:
# NumPy array form of the training count table, as input for make_recommendations.
np_train_counts = train_counts.to_numpy()

In [15]:
# Uses the default kind='user': each row of train counts is replaced by a
# similarity-weighted average over all rows.
recommendations = make_recommendations(np_train_counts, cos_counts)

In [16]:
# Scale every score down by a factor of 4 in a single vectorised, in-place
# step (replaces a Python-level loop over every element).
# NOTE(review): the 4 is unexplained -- presumably the 80/20 train/test size
# ratio, bringing train-scale counts down to test scale; confirm and name it.
# This cell is not idempotent: re-running it divides the scores again.
recommendations /= 4

In [17]:
def get_mae(pred, actual):
    """Mean absolute error over the positions where ``actual`` is nonzero.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted values, same shape as ``actual``.
    actual : numpy.ndarray
        Observed values; entries equal to 0 are excluded from the score.

    Returns
    -------
    float
        MAE between ``actual`` and ``pred`` restricted to the nonzero
        positions of ``actual``.
    """
    # Keep only the positions where `actual` is nonzero; zero entries are ignored.
    observed = actual.nonzero()
    # sklearn's documented argument order is (y_true, y_pred).
    return mae(actual[observed].flatten(), pred[observed].flatten())

In [18]:
# NumPy array form of the test count table, used as the ground truth for the error metrics.
np_test_counts = test_counts.to_numpy()

In [19]:
# MAE restricted to the cells where the test table is nonzero.
nonzero_mae = get_mae(recommendations, np_test_counts)
# MAE over every cell, zeros included. Arguments follow sklearn's documented
# (y_true, y_pred) order; the value is the same either way.
full_mae = mae(np_test_counts, recommendations)

In [20]:
# Display the MAE computed over every cell of the table.
full_mae

0.0329789789723071