In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
import math

In [2]:
def process_data(dataset):
    """Turn the raw Kickstarter projects table into a numeric feature table.

    Only rows whose ``state`` is ``'failed'`` or ``'successful'`` are kept.
    Identifier, text, category, currency, country and raw money columns are
    removed, the ``launched``/``deadline`` pair is replaced by the number of
    calendar-month boundaries between them, and ``state`` is label-encoded
    and moved to the last column.

    Args:
        dataset: pandas DataFrame containing at least the columns ``ID``,
            ``name``, ``category``, ``main_category``, ``currency``,
            ``country``, ``goal``, ``pledged``, ``usd pledged``,
            ``launched``, ``deadline`` and ``state``. It is not modified.

    Returns:
        A new DataFrame holding the remaining input columns, followed by
        ``total_months`` (int) and the integer-encoded ``state``.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    finished = dataset['state'].isin(['failed', 'successful'])
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame (avoids SettingWithCopyWarning).
    projects = dataset.loc[finished].copy()

    # Drop the columns that are not used as features.
    projects = projects.drop(columns=[
        'ID', 'name', 'category', 'main_category', 'currency', 'country',
        'goal', 'pledged', 'usd pledged',
    ])

    # Campaign length in months, counted as the difference between the
    # calendar months of deadline and launch. Plain year/month arithmetic is
    # used because subtracting monthly Periods yields offset objects (not
    # integers) on recent pandas versions, which cannot be cast to int.
    launched = pd.to_datetime(projects['launched'])
    deadline = pd.to_datetime(projects['deadline'])
    projects['total_months'] = (
        (deadline.dt.year - launched.dt.year) * 12
        + (deadline.dt.month - launched.dt.month)
    ).astype(int)
    projects = projects.drop(columns=['launched', 'deadline'])

    # Re-append the label so it ends up as the last column; the kNN helpers
    # in this notebook read the label from the last field of each row.
    state = projects.pop('state')
    projects['state'] = LabelEncoder().fit_transform(state)

    return projects


In [3]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label matches its prediction.

    Args:
        testSet: object exposing ``collect()`` that returns the test rows as
            indexable sequences (in this notebook, a Spark DataFrame). The
            last field of each row is taken as the true label.
        predictions: sequence of predicted labels, aligned by position with
            the collected rows. Each entry is passed through ``int()``
            before it is compared with the true label.

    Returns:
        Accuracy as a float percentage. ``0.0`` is returned when there are
        no rows and no predictions, instead of dividing by zero.

    Raises:
        IndexError: if there are fewer predictions than test rows.
    """
    hits = sum(
        1
        for position, row in enumerate(testSet.collect())
        if row[-1] == int(predictions[position])
    )
    total = len(predictions)
    if total == 0:
        # Nothing was evaluated; avoid ZeroDivisionError.
        return 0.0
    return hits / float(total) * 100.0

In [5]:
def euclideanDistance(instance1, instance2, length):
    """Euclidean distance between two rows over their first ``length`` fields.

    Args:
        instance1: indexable sequence of numbers.
        instance2: indexable sequence of numbers.
        length: how many leading positions to compare; fields at index
            ``length`` and beyond are ignored.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_total = 0
    idx = 0
    while idx < length:
        delta = instance1[idx] - instance2[idx]
        squared_total = squared_total + delta ** 2
        idx += 1
    return math.sqrt(squared_total)

In [15]:
import operator 
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distance is the Euclidean distance over every field of ``testInstance``
    except the last one, which is treated as the label and ignored.

    Args:
        trainingSet: object exposing ``collect()`` that returns the training
            rows as indexable sequences (in this notebook, a Spark
            DataFrame). It is collected on every call.
        testInstance: indexable row to find neighbours for.
        k: number of neighbours wanted.

    Returns:
        List of at most ``k`` training rows, nearest first. Rows at equal
        distance keep the order in which they were collected. Fewer than
        ``k`` rows are returned when the training set is smaller than ``k``,
        and an empty list when ``k`` is zero or negative.
    """
    feature_count = len(testInstance) - 1
    scored = [
        (row, euclideanDistance(testInstance, row, feature_count))
        for row in trainingSet.collect()
    ]
    scored.sort(key=operator.itemgetter(1))
    # Slicing, unlike indexing positions 0..k-1, tolerates a k larger than
    # the training set; max() keeps a negative k from slicing from the end.
    return [row for row, _ in scored[:max(k, 0)]]

In [18]:
import operator
def getResponse(neighbors):
    """Pick the majority class label among ``neighbors``.

    Args:
        neighbors: list of rows; the last field of each row is its label.

    Returns:
        The label with the most votes. When several labels share the top
        count, the one that appeared first in ``neighbors`` wins, because
        the tally keeps insertion order and the sort is stable.

    Raises:
        IndexError: if ``neighbors`` is empty.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    winner, _ = ranked[0]
    return winner

In [None]:
# Number of cleaned rows kept for this small-scale run.
SAMPLE_ROWS = 100
# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext injected by the PySpark kernel/shell. Confirm.
sqlContext = SQLContext(sc)
data = pd.read_csv('ks-projects-201801.csv')
dataset = process_data(data)
dataset = dataset[0:SAMPLE_ROWS]
print(dataset.shape)
dataRdd = sqlContext.createDataFrame(dataset)
# Despite the "Rdd" names, these are Spark DataFrames. The weights give an
# approximately 80/20 split, not an exact one.
trainRdd, testRdd = dataRdd.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [20]:
# Classify each held-out row from its 3 nearest training rows and gather the
# predicted labels in the order the test rows were collected.
predictions=list()
for test in testRdd.collect():
    # getNeighbors collects trainRdd on every call, so this triggers one
    # Spark collect of the training data per test row.
    output = getNeighbors(trainRdd,test,3)
    predictions.append(getResponse(output))
# getAccuracy collects testRdd again and pairs rows with predictions by
# position. NOTE(review): this assumes both collects return the rows in the
# same order; confirm.
accuracy = getAccuracy(testRdd, predictions)
print(accuracy)

90.0
