In [2]:
#Loading libraries
from pyspark import SparkContext         #used for spark context
from pyspark.sql import SQLContext       #used for creating dataframe
import pandas as pd                      #used for loading csv

#libraries used for creating vector and label points
# Convert the data frame to a dense vector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

#Mllib library for random forest
from pyspark.mllib.tree import RandomForest,RandomForestModel
from time import *


# --- Spark setup -------------------------------------------------------------
# One local Spark context for the notebook; the SQLContext built on top of it
# is used to turn pandas frames into Spark DataFrames.
sc = SparkContext(master='local', appName='Credit Fraud')
sql_sc = SQLContext(sc)

# --- Training data -----------------------------------------------------------
# pandas parses the CSV (the first line is taken as the header row), then the
# resulting frame is handed over to Spark.
pandas_df = pd.read_csv('~/cdacproject/Traindata.csv')
s_df = sql_sc.createDataFrame(pandas_df)


def _row_to_labeled_point(row):
    """Convert one row into a LabeledPoint.

    The last column of the row is used as the label; every column before it
    becomes an entry of the dense feature vector.
    """
    return LabeledPoint(row[-1], Vectors.dense(row[0:-1]))


transformed_df = s_df.rdd.map(_row_to_labeled_point)
train_row_count = transformed_df.count()
print("Number of training set rows: %d" % train_row_count)

# --- Model training ----------------------------------------------------------
# Forest hyper-parameters gathered in one place. The empty
# categoricalFeaturesInfo declares no feature as categorical; the fixed seed
# keeps the trained forest repeatable between runs.
rf_params = dict(
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="gini",
    maxDepth=4,
    maxBins=32,
    seed=1023
)

# Only the trainClassifier call itself is timed (wall-clock).
start_time = time()
model = RandomForest.trainClassifier(transformed_df, **rf_params)
end_time = time()

elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

# NOTE(review): persisting the model is disabled here, yet the evaluation cell
# loads a model from this same path -- presumably it was saved by an earlier
# run; confirm before relying on a fresh Restart & Run All.
#model.save(sc, "~/cdacproject/RFModel")



Number of training set rows: 170885
Time to train model: 7.214 seconds


In [3]:

# Evaluate the persisted random-forest model on the held-out test CSV and
# report its accuracy, then shut the Spark context down.
test_df = pd.read_csv('~/cdacproject/Testdata.csv')
test_data = sql_sc.createDataFrame(test_df)

# The last column contains the classification outcome. Turn this into an RDD
# of Labeled Points.
testdata = test_data.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# Count the RDD once and reuse the number below: the same value serves as the
# accuracy denominator, so no second counting job over the DataFrame is needed.
test_count = testdata.count()
print("Number of test set rows: %d" % test_count)

# Fail early with a clear message instead of a ZeroDivisionError further down.
if test_count == 0:
    raise ValueError("Test set is empty; cannot compute model accuracy")

# Loading the saved model.
# NOTE(review): this expects a model already stored at this path; the save call
# in the training cell is commented out, so presumably an earlier run wrote
# it -- confirm.
saveModel = RandomForestModel.load(sc, "~/cdacproject/RFModel")

# Make predictions and compute accuracy.
# Labels and predictions are both derived from `testdata`, then paired up
# element by element with zip.
predictions = saveModel.predict(testdata.map(lambda x: x.features))
labels_and_predictions = testdata.map(lambda x: x.label).zip(predictions)
correct_count = labels_and_predictions.filter(lambda x: x[0] == x[1]).count()

# float() keeps this a true division under Python 2 as well.
acc = correct_count / float(test_count)
# NOTE(review): if the two classes are heavily imbalanced (the app name
# suggests fraud detection), accuracy alone can look high while the rare class
# is missed -- consider also reporting precision/recall.
print("Model accuracy: %.3f%%" % (acc * 100))

sc.stop()  # Stopping the spark context



Number of test set rows: 113922
Model accuracy: 99.926%
