<h1>Import and set up Spark</h1>

In [1]:
#### This cell is to make Spark work on a Windows laptop
import os
import sys

# Root of the local Spark distribution. Raw strings keep the Windows backslashes
# literal instead of relying on "\s", "\p" and "\l" not being recognised escapes.
spark_home = r"C:\spark-2.0.1-bin-hadoop2.7"
os.environ['SPARK_HOME'] = spark_home

# Append pyspark and its bundled py4j to the Python path
sys.path.append(spark_home + r"\python")
sys.path.append(spark_home + r"\python\lib\py4j-0.10.3-src.zip")
#os.environ['SPARK_EXECUTOR_MEMORY']="5G"

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SQLContext
    print ("Successfully imported Spark Modules")

except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

# Initialize SparkContext.
# getOrCreate reuses a context that is already running, so re-executing this cell
# does not fail because a second SparkContext is being created in the same kernel.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

# Smoke test: count a tiny RDD and its distinct values.
words = sc.parallelize(["scala","java","hadoop","spark","akka"])
print (words.count())
print(words.countByValue())

Successfully imported Spark Modules
5
defaultdict(<class 'int'>, {'scala': 1, 'java': 1, 'spark': 1, 'hadoop': 1, 'akka': 1})


In [2]:
from pyspark.sql import types
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np

<h1>Import the data and remove the header row</h1>

In [3]:
# Relative path of the training CSV handed to Spark.
input_path = 'train.csv'
# Load the file as an RDD with one element per text line.
raw_data = sc.textFile(input_path)

print("number of rows before cleaning header:",raw_data.count())

# The first line of the file is treated as the header row.
header = raw_data.first()

# Keep every line whose text differs from the header
# (any later line identical to the header would be dropped as well).
cleaned_data = raw_data.filter(lambda row : row != header)

print("number of rows without header:",cleaned_data.count())
print('Number of partitions :'+str(cleaned_data.getNumPartitions()))

# SQL entry point bound to the SparkContext `sc`.
sqlContext = SQLContext(sc)

number of rows before cleaning header: 188319
number of rows without header: 188318
Number of partitions :2


In [4]:
# Column names: take the header text before the first ';' (if any) and split it on commas.
names = header.split(';')[0].split(',')
print(names)
print("Length of names",len(names))

['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat110', '

In [109]:
# Names at positions 1..116 of `names` (position 0 is skipped).
cats = names[1:117]
# Names from position 117 up to, but excluding, the last one.
# Presumably these are the continuous features and the final name is the "loss" target — confirm against the header.
conts = names[117:-1]

In [6]:
def create_StructField(string):
    """Build a non-nullable StructField for a column, typed from its name prefix.

    The first three characters of ``string`` pick the Spark type:
    ``cat`` and ``id`` give IntegerType, ``con`` and ``los`` give FloatType.

    Raises:
        ValueError: if the prefix is none of the four known ones.
    """
    prefix = string[:3]
    if prefix in ("cat", "id"):
        spark_type = types.IntegerType()
    elif prefix in ("con", "los"):
        spark_type = types.FloatType()
    else:
        raise ValueError("Can\'t read this string:" + prefix )

    # Third argument False marks the field as not nullable.
    return types.StructField(string, spark_type, False)

In [7]:
# One StructField per column name, in the order of `names`.
structField_list = [create_StructField(string) for string in names]

In [8]:
# Schema assembled from the per-column fields.
data_schema = types.StructType(structField_list)

In [9]:
def tryeval(val,column_number):
    """Convert one raw CSV field according to its column position.

    Column 0 is parsed as int, columns 1-116 are returned unchanged and
    columns 117-131 are parsed as float.

    Raises:
        Exception: if ``column_number`` matches none of those ranges.
    """
    if column_number == 0:
        return int(val)
    if 1 <= column_number <= 116:
        return val
    if 117 <= column_number <= 131:
        return float(val)
    raise Exception("There is a big problem")

def to_tuple(row):
    """Split a comma-separated line and convert each field with ``tryeval`` by position."""
    converted = []
    for position, field in enumerate(row.split(',')):
        converted.append(tryeval(field, position))
    return tuple(converted)

# Parse every data line into a tuple of converted fields.
cleaned_data_splitted = cleaned_data.map(to_tuple)

    

In [108]:
def to_tuples(list_):
    """Wrap each element of ``list_`` in its own 1-tuple and return them as a tuple."""
    wrapped = []
    for item in list_:
        wrapped.append((item,))
    return tuple(wrapped)

def fusion(x, y):
    """Merge two tuples of tuples position by position, dropping duplicates.

    For each pair of inner tuples the result holds the distinct elements of both,
    in set iteration order (no particular order). Pairing stops at the shorter input.
    """
    merged = []
    for left, right in zip(x, y):
        distinct = set(left + right)
        merged.append(tuple(distinct))
    return tuple(merged)

list_of_dictionaries = []
# Wrap each value of columns 1-116 of every row in a 1-tuple, then merge all rows with
# fusion: position i of the result holds the distinct values seen at position i+1 of the rows.
a = cleaned_data_splitted.map(lambda row: to_tuples(row[1:117])).reduce(fusion)

In [107]:
# Sort the distinct values collected for each column.
sorted_tuples = tuple(map(lambda values: tuple(sorted(values)), a))

In [13]:
# Rebuild the per-column lookup tables from scratch, so that re-running this cell does not
# append a second copy of every dictionary to an already populated list.
# Dictionary i maps each value of sorted_tuples[i] to its position inside that sorted tuple.
list_of_dictionaries = [
    {cat: idx for idx, cat in enumerate(tup)}
    for tup in sorted_tuples
]

In [15]:
# Broadcast the lookup tables; they are read back through bListOfDictionaries.value.
bListOfDictionaries = sc.broadcast(list_of_dictionaries)

In [16]:
def replace(row):
    """Return ``row`` with the fields at positions 1-116 replaced by integer codes.

    Each field is looked up in the broadcast dictionary for its position; a value
    absent from that dictionary is encoded as 0. Position 0 and positions 117
    onwards are copied through unchanged.
    """
    labels = row[1:117]
    lookups = bListOfDictionaries.value
    codes = tuple(
        table.get(label, 0)
        for table, label in zip(lookups, labels)
    )
    return (row[0],) + codes + row[117:]

In [17]:
# Encode the string fields of every parsed row as integer codes.
final_rdd = cleaned_data_splitted.map(replace)

In [236]:
# Build the DataFrame from the encoded RDD using data_schema, and mark it for caching.
# NOTE(review): coalesce only merges partitions; the source RDD reported 2 partitions earlier, so
# coalesce(12) presumably leaves the partition count unchanged — use repartition(12) if 12 was intended; confirm.
df = sqlContext.createDataFrame(final_rdd, schema = data_schema).coalesce(12).cache()

In [21]:
def plot_frequencies(column_idx, max_cat = 10):
    """Bar-plot the row counts of the first ``max_cat`` values of one column.

    Args:
        column_idx: position of the column in ``names``.
        max_cat: maximum number of values shown; values are taken in ascending
            order of the value itself (not of its count).
    """
    name = names[column_idx]
    # One (value, count) row per distinct value of the column.
    counts = df.groupBy(name).count().collect()
    pdf = pd.DataFrame(data=counts).sort_values(0)
    # Honour max_cat instead of a hard-coded 10 (the default keeps the previous behaviour).
    values = pdf[0].tolist()[:max_cat]
    frequencies = pdf[1].tolist()[:max_cat]
    plt.bar(range(len(values)), frequencies)
    plt.xticks(np.arange(len(values))+0.5,values)
    plt.title("Frequencies of " + name)
    plt.show()

In [23]:
from pyspark.mllib.stat import KernelDensity

def plotDistribution(rdd, plot=True, numSamples=1000):
    """Estimate the density of a numeric RDD with KernelDensity and plot or return it.

    Args:
        rdd: RDD of numbers; it is cached for the duration of the call and always
            unpersisted before returning, including on early return or error.
        plot: when true the curve is drawn; when false it is returned.
        numSamples: number of steps used to divide [min, max) into evaluation points.

    Returns:
        ``(None, None)`` when the minimum equals the maximum (no density is estimated).
        Otherwise ``None`` when ``plot`` is true, or ``(domain, density)`` when it is false.
    """
    rdd.cache()
    try:
        vmin = rdd.min()
        vmax = rdd.max()

        # A zero-width range would give a zero step below.
        if vmin==vmax:
            return None, None

        stddev = rdd.stdev()

        domain = np.arange(vmin, vmax, (vmax-vmin)/numSamples)

        # a simple heuristic to select bandwidth
        bandwidth = 1.06 * stddev * pow(rdd.count(), -.2)

        kd = KernelDensity()
        kd.setSample(rdd)
        kd.setBandwidth(bandwidth)
        density = kd.estimate(domain)
    finally:
        # Release the cache on every exit path; previously the early return above
        # (and any exception) left the RDD cached.
        rdd.unpersist()

    # plot
    if plot:
        plt.plot(domain, density)
        plt.xlim(0,20000)
        plt.show()
        return None
    return domain,density

In [24]:
def densityEstimation(cat_number,max_cat=5):
    """Plot the estimated loss density for each code of one categorical feature.

    Codes are processed in increasing order and ``max_cat`` consecutive codes share
    a figure. A code for which plotDistribution returns ``(None, None)`` is skipped.

    Args:
        cat_number: 1-based number of the categorical feature (1 selects ``cats[0]``).
        max_cat: number of consecutive category codes drawn on the same figure.
    """
    def _show_figure():
        # Finish and display the figure that currently holds the pending curves.
        plt.xlim(0,20000)
        plt.legend(loc='upper right')
        plt.title("Categorical feature N° " + str(cat_number))
        plt.show()

    cat_loss = df.select([cats[cat_number-1],"loss"]).rdd.cache()
    try:
        # Number of curves drawn since the last plt.show(); flushing is driven by this
        # count so that a skipped code never leaves curves undisplayed or shows an empty figure.
        pending_curves = 0

        for integer in range(len(sorted_tuples[cat_number-1])):

            # Select the losses belonging to this category code. The code is bound as a
            # default argument so the filter does not depend on the loop variable later on.
            samples = (cat_loss
                       .filter(lambda x, code=integer: x[0] == code)
                       .map(lambda x: x[1]))
            domain, density = plotDistribution(samples, False)

            # Nothing is drawn when no density could be estimated for this code.
            if domain is not None and density is not None:
                plt.plot(domain,density, label="category: " + str(integer))
                pending_curves += 1

            # End of a group of max_cat codes: show what has been drawn, if anything.
            if integer%max_cat == max_cat-1 and pending_curves:
                _show_figure()
                pending_curves = 0

        # Show the last, possibly incomplete, group.
        if pending_curves:
            _show_figure()
    finally:
        # Release the cached projection even if an estimation or plot call fails.
        cat_loss.unpersist()

In [26]:
def sample_and_plot(cont_idx):
    """Scatter-plot "loss" against one continuous feature on a random sample.

    Args:
        cont_idx: position of the feature in ``conts``; the plot title shows ``cont_idx + 1``.
    """
    # Sample of size 2000 requested without replacement from the (feature, loss) pairs.
    pairs = df.select([conts[cont_idx],"loss"]).rdd.takeSample(False, 2000)
    feature_values = [pair[0] for pair in pairs]
    losses = [pair[1] for pair in pairs]
    plt.scatter(feature_values, losses, s = 0.07)
    plt.title("Continuous feature N°" + str(cont_idx+1))
    plt.show()

<h1>Making a prediction (planned: MultilayerPerceptronClassifier; the code below trains a DecisionTreeRegressor)</h1>

In [333]:
from pyspark.ml.linalg import Vectors, VectorUDT

In [334]:
def keep_index(tup):
    """Return the elements of ``tup`` at positions ``idx - 1`` for every ``idx`` in the global ``list_indices``."""
    return tuple(tup[idx-1] for idx in list_indices)

In [373]:
# `last` must exist before it is used to build list_indices; in the previous order this cell
# only ran when `last` was still defined in the kernel from an earlier execution.
last = len(names)
# Indices 2 .. last-1; keep_index reads element idx-1 for each of them.
list_indices = list(range(2, last))
# Number of rows in df.
L = df.rdd.count()
print(last)
print(L)

132
188318


In [374]:
# Fields for the columns whose 0-based position in `names` appears in list_indices.
# NOTE(review): keep_index reads element idx-1 for the same indices, so this selection looks shifted
# by one column relative to the vector built by keep_index — confirm which one is intended.
structField_list_selected = [create_StructField(string) for i,string in enumerate(names) if i in list_indices]

In [389]:
# Mean and standard deviation of the last column of df, both taken from the same projection.
_last_column = df.rdd.map(lambda row: row[-1])
mean = _last_column.mean()
stdev = _last_column.stdev()

In [391]:
# Display the standard deviation of the last column of df.
print(stdev)

2904.07847613


In [399]:
# Modelling DataFrame: "label" is the last column of each row minus `mean`; "features" is a dense
# vector of the elements selected by keep_index.
# NOTE(review): `stdev` is not used here, so the label is centred but not scaled — confirm whether
# dividing by `stdev` was intended.
df_Perceptron = sqlContext.createDataFrame(df.rdd.map(lambda x: (float(x[-1]-mean), Vectors.dense(keep_index(x)))), ["label", "features"])

In [394]:
# Preview the first label values.
df_Perceptron.select("label").show()

+--------------------+
|               label|
+--------------------+
|-0.28379321034954846|
| -0.6038878509372658|
|-0.01110424455208...|
| -0.7222558643451996|
|-0.09417362171162356|
|  0.7250260104909019|
| -0.6560145431486457|
| 0.18884211257377148|
|  2.4940312630537584|
|   1.083735231032165|
|  1.1568256298113546|
|  1.0083723009046492|
| -0.6350681125941565|
| -0.6768300795292171|
| -0.8443841008324754|
| -0.5653730599268415|
|  1.2299881590987147|
|-0.13038137138117617|
| 0.38910179184875254|
|  0.2618910657346301|
+--------------------+
only showing top 20 rows



In [383]:
# Inspect the feature vector of the first row.
df_Perceptron.select("features").first()

Row(features=DenseVector([0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 1.0, 1.0, 3.0, 3.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 2.0, 4.0, 0.0, 2.0, 15.0, 1.0, 6.0, 0.0, 0.0, 8.0, 4.0, 6.0, 9.0, 6.0, 45.0, 28.0, 2.0, 19.0, 55.0, 0.0, 14.0, 269.0, 0.7263, 0.2459, 0.1876, 0.7896, 0.3101, 0.7184, 0.3351, 0.3026, 0.6714, 0.8351, 0.5697, 0.5946, 0.8225, 0.7148]))

In [400]:
# Random split with weights 0.7 / 0.3. The fixed seed makes re-runs draw the same split
# (for identically partitioned input) instead of a different one on every execution.
(df_Perceptron_train,df_Perceptron_test) = df_Perceptron.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Preview the first label values of the training split.
df_Perceptron_train.select("label").show()


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


# Fit the feature indexer on the complete data set (training and test rows together).
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=32).fit(df_Perceptron)

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_Perceptron_train)

# Make predictions.
predictions = model.transform(df_Perceptron_test)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(100)

# Select (prediction, true label) and compute test error.
# The evaluator is configured with metricName="mae", so the value is a mean absolute error;
# the variable name and the printed message now say so (they previously claimed RMSE).
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

treeModel = model.stages[1]
# summary only
print(treeModel)

<h1>Making a prediction with GeneralizedLinearRegression</h1>

In [330]:
from pyspark.ml.regression import GeneralizedLinearRegression


glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit on the complete data set (no train/test split in this cell).
model = glr.fit(df_Perceptron)

# Fitted parameters of the generalized linear regression model.
for label, attribute in (
    ("Coefficients: ", "coefficients"),
    ("Intercept: ", "intercept"),
):
    print(label + str(getattr(model, attribute)))

# Training summary: each metric is read and printed in turn, in the listed order.
summary = model.summary
for label, attribute in (
    ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
    ("T Values: ", "tValues"),
    ("P Values: ", "pValues"),
    ("Dispersion: ", "dispersion"),
    ("Null Deviance: ", "nullDeviance"),
    ("Residual Degree Of Freedom Null: ", "residualDegreeOfFreedomNull"),
    ("Deviance: ", "deviance"),
    ("Residual Degree Of Freedom: ", "residualDegreeOfFreedom"),
    ("AIC: ", "aic"),
):
    print(label + str(getattr(summary, attribute)))
print("Deviance Residuals: ")
summary.residuals().show()

Coefficients: [1.35335746922e-05,-1051.51414254,1227.4934644,1698.16241385,990.503116633,1047.74607516,364.047070749,4416.08406751,526.944026828]
Intercept: 1747.4076279010137
Coefficient Standard Errors: [3.5212580562094656e-05, 14.106974375468429, 14.000582331838448, 26.783596031267724, 13.773924484722937, 13.671555523887191, 15.292001256525488, 39.08956700536803, 25.729132029377983, 18.654263194171712]
T Values: [0.38433919003359335, -74.53860158447762, 87.67445776954186, 63.40307746073159, 71.91146704279849, 76.63693230312296, 23.80637201386897, 112.97347107749498, 20.48044318891395, 93.67336622799282]
P Values: [0.7007274989021783, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Dispersion: 6695589.685371516
Null Deviance: 1588212205186.7375
Residual Degree Of Freedom Null: 188317
Deviance: 1260833102472.9395
Residual Degree Of Freedom: 188308
AIC: 3494221.733282857
Deviance Residuals: 
+-------------------+
|  devianceResiduals|
+-------------------+
|-1752.2242908288758|
|-1691.301

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Configure the generalized linear regression (gaussian family, identity link).
glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit on the training split, then score the test split.
model = glr.fit(df_Perceptron_train)
predictions = model.transform(df_Perceptron_test)

# Fitted parameters of the generalized linear regression model.
for label, attribute in (
    ("Coefficients: ", "coefficients"),
    ("Intercept: ", "intercept"),
):
    print(label + str(getattr(model, attribute)))

# Training summary: each metric is read and printed in turn, in the listed order.
summary = model.summary
for label, attribute in (
    ("Coefficient Standard Errors: ", "coefficientStandardErrors"),
    ("T Values: ", "tValues"),
    ("P Values: ", "pValues"),
    ("Dispersion: ", "dispersion"),
    ("Null Deviance: ", "nullDeviance"),
    ("Residual Degree Of Freedom Null: ", "residualDegreeOfFreedomNull"),
    ("Deviance: ", "deviance"),
    ("Residual Degree Of Freedom: ", "residualDegreeOfFreedom"),
    ("AIC: ", "aic"),
):
    print(label + str(getattr(summary, attribute)))
print("Deviance Residuals: ")
summary.residuals().show()