In [None]:
from numba import jit
import pyodbc
import pandas as pd 
import numpy as np 
import scipy as sp
import random
import operator
import math
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
sns.set(color_codes=True)

In [None]:
# Connect to SQL Server with 'Trusted_Connection=yes', so no user name or
# password is embedded in the notebook.
conn = pyodbc.connect(
    'Trusted_Connection=yes',
    driver='{SQL Server Native Client 11.0}',
    server='sql5',
    database='canadahelps_prod_repl',
)

# NOTE(review): 'table name' looks like a placeholder rather than a runnable
# SQL statement -- replace it with the real SELECT before running this cell.
# Missing values are replaced with 0 so the distance arithmetic further down
# never encounters NaN.
df = pd.read_sql_query('table name', conn).fillna(0)

In [None]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """Randomly partition the rows of ``df`` into train, validate and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected by index label, so the index
        labels should be unique.
    train_percent : float
        Fraction of the rows assigned to the training frame.
    validate_percent : float
        Fraction of the rows assigned to the validation frame. Whatever is
        left after train and validate goes to the test frame.
    seed : int or None
        Passed to ``np.random.seed``. Note that this reseeds NumPy's global
        random generator as a side effect.

    Returns
    -------
    (train, validate, test) : tuple of pandas.DataFrame
        Each frame has been passed through ``reset_index()``: rows are
        renumbered from 0 and the original index label is kept as an extra
        leading column (named ``'index'`` for an unnamed index).
    """
    np.random.seed(seed)
    shuffled_labels = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    # ``DataFrame.ix`` was removed in pandas 1.0. The permutation holds index
    # *labels*, so ``.loc`` is the equivalent label-based selection.
    train = df.loc[shuffled_labels[:train_end]]
    validate = df.loc[shuffled_labels[train_end:validate_end]]
    test = df.loc[shuffled_labels[validate_end:]]

    # reset_index() is deliberately called without drop=True: the extra
    # leading column it creates shifts every column position by one, and
    # distance() in this notebook selects feature groups by column position.
    return train.reset_index(), validate.reset_index(), test.reset_index()

In [None]:
def euclideanDistance(instance1, instance2):
    """Euclidean distance between the first rows of two frames.

    Both arguments must expose a 2-D ``.values`` array (e.g. single-row
    DataFrames) of compatible shape. Only row 0 of the element-wise
    difference is used; any further rows are ignored.

    Returns the square root of the summed squared differences as a float.
    """
    first_row_deltas = (instance1.values - instance2.values)[0]

    sum_of_squares = 0
    for delta in first_row_deltas:
        sum_of_squares = sum_of_squares + pow(delta, 2)

    return math.sqrt(sum_of_squares)

In [None]:
def hammingDistance(instance1, instance2):
    """Sum of the element-wise products of the first rows of two frames.

    Despite the name, this is not a count of differing positions: it
    multiplies the two rows element by element and adds up the products.
    For 0/1 indicator columns that equals the number of positions where both
    rows hold a 1, so a larger value means the rows agree more.

    Both arguments must expose a 2-D ``.values`` array of compatible shape;
    only row 0 of the product is used.
    """
    first_row_products = (instance1.values * instance2.values)[0]

    overlap = 0
    for term in first_row_products:
        overlap = overlap + term

    return overlap

In [None]:
def distance(instance1, instance2):
    """Weighted dissimilarity between two single-row frames.

    Feature groups are selected by *column position*, so both frames must
    share the same column layout. The group names below are taken from the
    original variable names; the underlying schema is not visible in this
    notebook and should be confirmed against the query that builds ``df``.

        2:5    sizeClass        indicator group
        5:18   province         indicator group
        18:51  CMA              indicator group
        51:60  category         indicator group
        60:61  daysRegistered   numeric
        61:62  activeAge        numeric
        62:67  activeDaysYr     numeric
        67:72  noDonors         numeric
        72:75  noDonations      numeric
        75:80  dollars          numeric

    Indicator groups contribute the reciprocal of their overlap as returned
    by ``hammingDistance``; numeric groups contribute ``euclideanDistance``.
    The two halves are blended 50/50.

    Parameters
    ----------
    instance1, instance2 : pandas.DataFrame
        Single-row frames with at least 80 columns in the layout above.

    Returns
    -------
    float
        The combined distance. It is ``inf`` whenever any indicator group
        has zero overlap between the two rows.
    """
    def _pair(start, stop):
        # Same positional column range from both rows. ``.ix`` was removed in
        # pandas 1.0; with string column labels it sliced by position, which
        # is what ``.iloc`` does explicitly.
        return instance1.iloc[:, start:stop], instance2.iloc[:, start:stop]

    def _categorical(start, stop):
        # Reciprocal of the overlap for one indicator group. The original
        # ``1/overlap`` divided by zero whenever the rows shared no active
        # indicator, and used integer division under Python 2. Use a float
        # reciprocal and define the zero-overlap case explicitly as +inf.
        # NOTE(review): +inf for any mismatch makes the whole distance +inf,
        # so mismatching rows are all tied -- confirm this is intended.
        overlap = hammingDistance(*_pair(start, stop))
        return 1.0 / overlap if overlap else float('inf')

    def _numeric(start, stop):
        # Euclidean distance over one numeric group.
        return euclideanDistance(*_pair(start, stop))

    sizeClass_dist = _categorical(2, 5)
    province_dist = _categorical(5, 18)
    CMA_dist = _categorical(18, 51)
    category_dist = _categorical(51, 60)

    # Province and CMA are folded into a single term before weighting.
    hammingDist = (0.33)*sizeClass_dist + (0.33)*category_dist \
                + (0.33)*((0.167)*province_dist + (0.833)*CMA_dist)

    daysRegistered_dist = _numeric(60, 61)
    activeAge_dist = _numeric(61, 62)
    activeDaysYr_dist = _numeric(62, 67)
    noDonors_dist = _numeric(67, 72)
    noDonations_dist = _numeric(72, 75)
    dollars_dist = _numeric(75, 80)

    # The three donation-volume groups share one quarter of the numeric weight.
    euclDist = (0.25)*daysRegistered_dist + (0.25)*activeAge_dist + (0.25)*activeDaysYr_dist \
                + (0.25)*((0.33)*noDonors_dist + (0.33)*noDonations_dist + (0.33)*dollars_dist)

    return (0.5)*hammingDist + (0.5)*euclDist


In [None]:
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` rows of ``trainingSet`` closest to ``testInstance``.

    Every training row is scored with ``distance(testInstance, row)`` and the
    rows are ordered by ascending distance. The sort is stable, so rows with
    equal distance keep their training-set order.

    Parameters
    ----------
    trainingSet : pandas.DataFrame
        Candidate rows; iterated by position.
    testInstance : pandas.DataFrame
        Single-row frame to find neighbours for.
    k : int
        Number of neighbours wanted. If ``k`` exceeds the number of training
        rows, all rows are returned; a non-positive ``k`` gives an empty list.

    Returns
    -------
    list of pandas.DataFrame
        Single-row frames, nearest first.
    """
    scored = []
    for position in range(len(trainingSet)):
        # ``.ix`` was removed in pandas 1.0; rows are visited by position, so
        # use ``.iloc``. The row is extracted once and reused for both the
        # distance computation and the stored result.
        candidate = trainingSet.iloc[[position]]
        scored.append((candidate, distance(testInstance, candidate)))

    scored.sort(key=operator.itemgetter(1))

    # Slicing (rather than indexing 0..k-1) avoids an IndexError when fewer
    # than k training rows exist.
    return [candidate for candidate, _ in scored[:max(k, 0)]]

In [None]:
def getResponse(neighbors):
    """Return the mean ``lastUsage`` value across the given neighbours.

    Parameters
    ----------
    neighbors : list of pandas.DataFrame
        Frames with a ``'lastUsage'`` column; the value in each frame's first
        row is used.

    Returns
    -------
    float
        Arithmetic mean of the neighbours' ``lastUsage`` values.

    Raises
    ------
    ZeroDivisionError
        If ``neighbors`` is empty.
    """
    # ``.iloc[0]`` reads the first row directly, replacing the removed ``.ix``
    # accessor and the per-neighbour reset_index() the original needed.
    lastUsages = [neighbor['lastUsage'].iloc[0] for neighbor in neighbors]

    # float() forces true division, so the mean is not truncated under
    # Python 2 when the values are integers (same idiom as getAccuracy).
    return sum(lastUsages) / float(len(lastUsages))

In [None]:
def getAccuracy(testSet, predictions, tolerance=1.33):
    """Percentage of test rows whose ``lastUsage`` is within the tolerated bound.

    A prediction counts as correct when
    ``actual lastUsage <= prediction * tolerance``. The check is one-sided:
    there is no lower bound, so any over-estimate counts as correct.
    NOTE(review): confirm that a one-sided criterion is what is wanted.

    Parameters
    ----------
    testSet : pandas.DataFrame
        Frame with a ``'lastUsage'`` column holding the actual values.
    predictions : sequence of numbers
        One prediction per test row, in row order.
    tolerance : float
        Multiplier applied to each prediction before comparing (default 1.33,
        the value previously hard-coded).

    Returns
    -------
    float
        Accuracy in percent (0.0 - 100.0); 0.0 for an empty test set.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Read the column once by position; ``.ix`` was removed in pandas 1.0.
    actual = testSet['lastUsage'].values

    correct = 0
    for x in range(total):
        if actual[x] <= predictions[x] * tolerance:
            correct += 1
    return (correct / float(total)) * 100.0

In [None]:
def main():
    """Run the k-nearest-neighbour evaluation on the module-level ``df``.

    Splits ``df`` into training and test rows, predicts ``lastUsage`` for
    every test row from its k nearest training rows, prints each prediction
    next to the actual value, and finally prints the accuracy reported by
    ``getAccuracy``.
    """
    # The original called ``train_test_split(df, 0.33)``, a name that is
    # neither defined nor imported anywhere in this notebook, so a fresh
    # kernel raised NameError. Use the notebook's own splitter instead.
    # NOTE(review): 0.33 is assumed to be the test fraction (67% train, no
    # validation rows) -- confirm. Also confirm that the leading 'index'
    # column added by the splitter matches the column positions that
    # distance() expects.
    trainingSet, _, testSet = train_validate_test_split(
        df, train_percent=0.67, validate_percent=0.0)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # generate predictions
    predictions = []
    k = 10

    for x in range(len(testSet)):
        # ``.iloc[[x]]`` keeps the row as a one-row DataFrame, which is the
        # shape getNeighbors/distance operate on (``.ix`` no longer exists).
        neighbors = getNeighbors(trainingSet, testSet.iloc[[x]], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet['lastUsage'].iloc[x]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

In [None]:
# Run the evaluation: prints the split sizes, one line per test row with the
# predicted and actual lastUsage, and the final accuracy percentage.
main()