In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from pyspark.sql.functions import col
from pyspark.sql import SQLContext
import math

In [2]:
def process_data(dataset):
    projects = dataset[(dataset['state'] == 'failed') | (dataset['state'] == 'successful')]
    
    # deleting unnecesary rows
    projects.drop('ID',axis=1,inplace=True)
    projects.drop('name',axis=1,inplace=True)
    projects.drop('category',axis=1,inplace=True)
    projects.drop('currency',axis=1,inplace=True)
    projects.drop('goal',axis=1,inplace=True)
    projects.drop('pledged',axis=1,inplace=True)
    projects.drop('usd pledged',axis=1,inplace=True)
    # finding length of the project 
    projects['launched'] = pd.to_datetime(projects['launched']).dt.to_period("M")
    projects['deadline'] = pd.to_datetime(projects['deadline']).dt.to_period("M")
    #Creating a new columns with Campaign total months
    projects['total_months'] = projects['deadline']- projects['launched']
    projects['total_months'] = projects['total_months'].astype(int)
    projects.drop('launched',axis=1,inplace=True)
    projects.drop('deadline',axis=1,inplace=True)
    state = projects.pop('state')
    projects['state']=state
    projects['state']= LabelEncoder().fit_transform(projects['state'])
    #onehotencoder = ce.OneHotEncoder(cols = ['main_category','country'])
    #projects = onehotencoder.fit_transform(projects)
    projects.drop('main_category',axis=1,inplace=True)
    projects.drop('country',axis=1,inplace=True)

    return projects


In [3]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [4]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [5]:
def splitDataByClass(inputRdd):
    x_train_success   = inputRdd.filter(col("state").isin([1]))
    cols = list(set(x_train_success.columns) - {'state'})
    x_train_success = x_train_success.select(cols)
    x_train_failure = inputRdd.filter(col("state").isin([0]))
    cols = list(set(x_train_failure.columns) - {'state'})
    x_train_failure = x_train_failure.select(cols)
    class_dict = {'1':x_train_success, '0':x_train_failure}
    return class_dict

In [6]:
def summarize(inputRdd):
    names = inputRdd.schema.names
    summaries_list = list()
    for name in names:
        summaries_list.append(
            (mean(inputRdd.select(name).rdd.flatMap(list).collect()),
             stdev(inputRdd.select(name).rdd.flatMap(list).collect())))
    outputRdd = sc.parallelize(summaries_list)
    return outputRdd

In [7]:
def summarizeByClass(inputRdd):
    class_dict = splitDataByClass(inputRdd)
    summaries = {}
    for k,v in class_dict.items():
        summaries[k] = summarize(v)
    return summaries

In [8]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [9]:
def calculateClassProbabilities(summaries, testSample):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        i = 0
        for mean, stdev in classSummaries.collect():
            probabilities[classValue] *= calculateProbability(testSample[i], mean, stdev)
            i = i + 1
    return probabilities

In [10]:
def predict(summaries, testSample):
    probabilities = calculateClassProbabilities(summaries, testSample)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [11]:
def getPredictions(summaries, testRdd):
    predictions = []
    for test in testRdd.collect():
        result = predict(summaries, test)
        predictions.append(result)
    return predictions

In [12]:
def getAccuracy(testSet, predictions):
    correct = 0
    i = 0
    for x in testSet.collect():
        if x[-1] == int(predictions[i]):
            correct = correct + 1
        i = i + 1
    return (correct/float(len(predictions))) * 100.0

In [18]:
sqlContext = SQLContext(sc)
data = pd.read_csv('ks-projects-201801.csv')
dataset = process_data(data)
dataset = dataset[0:10000]
print(dataset.shape)
dataRdd = sqlContext.createDataFrame(dataset)
trainRdd, testRdd = dataRdd.randomSplit([0.8, 0.2])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A

(10000, 5)


In [19]:
summaries = summarizeByClass(trainRdd)

In [21]:
for k,v in summaries.items():
    print(k)
    print(v.collect())

1
[(267.8160637645616, 1488.466654579961), (25783.579644389934, 207164.11841707584), (9594.878044144692, 24124.129509457936), (1.0505824647455548, 0.5065210736714557)]
0
[(17.1019847972973, 94.43493454200193), (1359.1441195101372, 6598.745068070568), (39756.49819890201, 339653.07885709795), (1.1547719594594594, 0.5314903342880606)]


In [22]:
# test model
cols = list(set(testRdd.columns) - {'state'})
XtestRdd= testRdd.select(cols)
predictions = getPredictions(summaries, XtestRdd)

In [23]:
print(predictions)
accuracy = getAccuracy(testRdd, predictions)
print('Accuracy',accuracy)

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',

Accuracy 69.13086913086913
