# TASK 2 
### NAME: YEO ZHENG XU ISAAC

In [1]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.sql import Row
from pyspark.sql import functions as sql_functions
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#### Read & Load in DataFile

In [2]:
musicsongsdata = sqlContext.read.load('/FileStore/tables/YearPredictionMSD.txt', 'text')
numbr_points = musicsongsdata.count()

In [3]:
att_descr = "90 attributes, 12 = timbre average, 78 = timbre covariance. \nThe first value is the year (target), ranging from 1922 to 2011. \nFeatures extracted from the 'timbre' features from The Echo Nest API. \nWe take the average and covariance over all 'segments', each segment being described by a 12-dimensional timbre vector."

df = musicsongsdata.rdd.map(lambda row: str(row['value']).split(",")).map(lambda row: LabeledPoint(row[0], [float(x) for x in row[1:]])).toDF(["Features", "Year"])

In [4]:
print (att_descr)
print ('\nNumber of data points: ', numbr_points, "\n")

In [5]:
musicsongsdata.take(1)

### prepare data before testing

In [6]:
year_data = df.select("Year").groupBy("Year").count()
year_data.show()

### Add data into dataframe

In [7]:
year_data = year_data.toPandas()
year_data

Unnamed: 0,Year,count
0,1988.0,5611
1,1976.0,2179
2,1951.0,74
3,1940.0,52
4,1928.0,52
5,1979.0,3108
6,1953.0,133
7,1987.0,5122
8,1959.0,592
9,1934.0,29


# average year 

In [9]:
avg_year = year_data["Year"]
average_year = sum(avg_year)/len(avg_year)

print("Average year: ", average_year)

## prepare data for training and testing

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol = 'prediction')

weights = [.8, .1, .1]
seed = 42
parsed_train_data_df, parsed_val_data_df, parsed_test_data_df = df.randomSplit(weights, seed= seed)

parsed_train_data_df.cache()
parsed_val_data_df.cache()
parsed_test_data_df.cache()
n_train = parsed_train_data_df.count()
n_val = parsed_val_data_df.count()
n_test = parsed_test_data_df.count()

print ('Training dataset size: {0}'.format(n_train))
print ('Validation dataset size: {0}'.format(n_val))
print ('Testing dataset size: {0}'.format(n_test))

##  Baseline Regression Model 

In [11]:
preds_and_labels_test = parsed_test_data_df.rdd.map(lambda row: (float(1967), float(row['Year'])))
preds_and_labels_test_df = sqlContext.createDataFrame(preds_and_labels_test, ["prediction", "label"])
rmse_test_base = evaluator.evaluate(preds_and_labels_test_df)

print ('Baseline Model RMSE = {0:.3f}'.format(rmse_test_base))

## Linear regression model 

In [12]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

In [13]:
# Linear regression model parameter values
num_iters = 500  # iterations
reg = 1e-1  # regParam
alpha = .2  
use_intercept = True  # intercept

parsed_train_data_df = parsed_train_data_df.rdd.map(lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_train_data_df = sqlContext.createDataFrame(parsed_train_data_df,["features","label"])
parsed_train_data_df
lin_reg = LinearRegression(maxIter = num_iters, regParam = reg, elasticNetParam = alpha, fitIntercept = use_intercept, labelCol = 'label', featuresCol = 'features')

first_model = lin_reg.fit(parsed_train_data_df)

In [14]:
coeffs_LR1 = first_model.coefficients
intercept_LR1 = first_model.intercept
print (coeffs_LR1, intercept_LR1)

In [15]:
parsed_val_data_df = parsed_val_data_df.rdd.map(lambda row: (Vectors.dense(row["Features"]), float(row['Year'])))
parsed_val_data_df = sqlContext.createDataFrame(parsed_val_data_df,["features","label"])

val_pred_df = first_model.transform(parsed_val_data_df)
rmse_val_LR1 = evaluator.evaluate(val_pred_df)

print ('Validation RMSE:LR1 = ',  rmse_val_LR1)

### RMSE derived from implementing Linear regression is around 9.xx
### Results were greatly improved by implementing Linear regression from the Baseline Regression model which was around 33.216