In [None]:
from pyspark.sql import SparkSession

In [None]:
# Get the active Spark session, or create one, under the app name 'lrex'.
spark = SparkSession.builder.appName('lrex').getOrCreate()

In [None]:
from pyspark.ml.regression import LinearRegression

#### Get familiar with the attributes

In [None]:
# Read the sample regression data with the 'libsvm' reader.
# NOTE(review): the path ends in .csv but is parsed as libsvm — confirm the
# file really is in libsvm format.
training = spark.read.format('libsvm').load('sample_linear_regression_data.csv')

In [None]:
# Print a preview of the loaded data.
training.show()

In [None]:
# Linear regression estimator with the features, label and prediction
# column names spelled out explicitly.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [None]:
# Fit the estimator on the whole data set (no train/test split yet).
lrModel = lr.fit(training)

In [None]:
# Inspect the coefficients of the fitted model.
lrModel.coefficients

In [None]:
# Inspect the intercept of the fitted model.
lrModel.intercept

In [None]:
# Keep the summary object exposed by the fitted model; its metrics are read below.
training_summary = lrModel.summary

In [None]:
# R-squared reported by the model summary.
training_summary.r2

In [None]:
# Root mean squared error reported by the model summary.
training_summary.rootMeanSquaredError

In [None]:
# Load the same file again under a new name; this copy is split into
# train and test sets below.
# NOTE(review): the path ends in .csv but is parsed as libsvm — confirm the
# file really is in libsvm format.
all_data = spark.read.format('libsvm').load('sample_linear_regression_data.csv')

In [None]:
# Split with weights [0.7, 0.3] and keep the raw return value to inspect it.
split_object = all_data.randomSplit([0.7, 0.3])

In [None]:
# Display what randomSplit returned.
split_object

In [None]:
# Unpack the two splits (weights 0.7 / 0.3) into training and test sets.
# The DataFrame is named `all_data`; the previous `all_Data` spelling
# raised a NameError.
train_data, test_data = all_data.randomSplit([0.7, 0.3])

In [None]:
# Print a preview of the training split.
train_data.show()

In [None]:
# Print summary statistics for the test split.
test_data.describe().show()

In [None]:
# Fit on the training split only, so the test split is not used for fitting.
correct_model = lr.fit(train_data)

In [None]:
# Evaluate the fitted model on the test split; metrics are read from the result below.
test_results = correct_model.evaluate(test_data)

In [None]:
# Print the residuals from the test-set evaluation.
test_results.residuals.show()

In [None]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [None]:
# Keep only the 'features' column, i.e. drop the label, to mimic unlabeled input.
unlabeled_data = test_data.select('features')

In [None]:
# Print a preview of the features-only data.
unlabeled_data.show()

In [None]:
# Run the fitted model over the features-only data to obtain predictions.
predictions = correct_model.transform(unlabeled_data)

In [None]:
# Print a preview of the predictions.
predictions.show()

#### A more realistic example

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Get or create the Spark session for the e-commerce example.
# NOTE(review): if the session from the first example is still active,
# getOrCreate presumably returns that one — confirm whether the new app
# name takes effect.
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Load the customer CSV; the first row is the header and column types
# are inferred from the data.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    inferSchema=True,
    header=True,
)

In [None]:
# Print the column names and inferred types.
data.printSchema()

In [None]:
# Print each field of the first row on its own line.
first_row = data.head(1)[0]
for item in first_row:
    print(item)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names available for feature selection.
data.columns

In [None]:
# Columns combined into the single 'features' vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Apply the assembler to the customer data, adding the 'features' column.
output = assembler.transform(data)

In [None]:
# Print the schema of the assembled DataFrame.
output.printSchema()

In [None]:
# Look at the first assembled row.
output.head(1)

In [None]:
# Keep only the feature vector and the column used as the regression label.
final_data = output.select('features', 'Yearly Amount Spent')

In [None]:
# Print a preview of the modelling data.
final_data.show()

In [None]:
# Split the modelling data into training and test sets (weights 0.7 / 0.3).
# `randomSplit` is a method on `final_data`; the previous
# `final_data_randomSplit` spelling referred to an undefined name.
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [None]:
# Print summary statistics for the training split.
train_data.describe().show()

In [None]:
# Print summary statistics for the test split.
test_data.describe().show()

In [None]:
# Estimator whose label column is 'Yearly Amount Spent'; other settings are left at their defaults.
lr = LinearRegression(labelCol = 'Yearly Amount Spent')

In [None]:
# Fit the estimator on the training split.
lr_model = lr.fit(train_data)

In [None]:
# Evaluate the fitted model on the test split.
test_results = lr_model.evaluate(test_data)

In [None]:
# Print the residuals from the test-set evaluation.
test_results.residuals.show()

In [None]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [None]:
# R-squared on the test split.
test_results.r2

In [None]:
# Summary statistics of the full modelling data, to put the error metrics above in context.
final_data.describe().show()

In [None]:
# Keep only the 'features' column of the test split, dropping the label.
unlabeled_data = test_data.select('features')

In [None]:
# Print a preview of the features-only data.
unlabeled_data.show()

#### Ship cruise example

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Get or create the Spark session for the cruise-ship example.
# NOTE(review): if an earlier session is still active, getOrCreate
# presumably returns that one — confirm whether the new app name takes effect.
spark = SparkSession.builder.appName('cruise').getOrCreate()

In [None]:
# Load the cruise ship CSV; the first row is the header and column types
# are inferred from the data.
df = spark.read.csv(
    'cruise_ship_info.csv',
    inferSchema=True,
    header=True,
)

In [None]:
# Print the column names and inferred types.
df.printSchema()

In [None]:
# Print the first five rows, each followed by blank lines for readability.
# The `for` header previously lacked its trailing colon (SyntaxError).
for ship in df.head(5):
    print(ship)
    print('\n')

In [None]:
# Count the rows per cruise line.
df.groupBy('Cruise_line').count().show()

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string column 'Cruise_line' as a numeric index column.
# The output column is named 'cruise_cat' because that is the name the
# feature assembler further down lists among its input columns; the
# previous name 'Curise' did not match it.
indexer = StringIndexer(inputCol='Cruise_line',
                        outputCol='cruise_cat')
indexed = indexer.fit(df).transform(df)
# Inspect the first indexed row (`indexed` is the DataFrame; `index` was undefined).
indexed.head(1)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the columns available after indexing.
indexed.columns

In [None]:
# Columns combined into the single 'features' vector column.
ship_feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_cat',
]
assembler = VectorAssembler(inputCols=ship_feature_columns,
                            outputCol='features')

In [None]:
# Apply the assembler to the indexed data, adding the 'features' column.
output = assembler.transform(indexed)

In [None]:
# Preview the feature vector next to the 'crew' column.
output.select('features', 'crew').show()

In [None]:
# Keep only the feature vector and the 'crew' column used as the regression label.
final_data = output.select(['features', 'crew'])

In [None]:
# Split into training and test sets with weights 0.7 / 0.3.
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [None]:
# Print summary statistics for the training split.
train_data.describe().show()

In [None]:
# Print summary statistics for the test split.
test_data.describe().show()

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Estimator whose label column is 'crew'; other settings are left at their defaults.
ship_lr = LinearRegression(labelCol = 'crew')

In [None]:
# Fit the estimator on the training split.
trained_ship_model = ship_lr.fit(train_data)

In [None]:
# Evaluate the fitted model on the test split.
# The fitted model is bound to `trained_ship_model`; `train_ship_model`
# was an undefined name.
ship_results = trained_ship_model.evaluate(test_data)

In [None]:
# Root mean squared error on the test split.
ship_results.rootMeanSquaredError

In [None]:
# Summary statistics of the training split, to put the error metrics in context.
# `describe` was misspelled as `descibe`, which raised an AttributeError.
train_data.describe().show()

In [None]:
# R-squared on the test split.
ship_results.r2

In [None]:
# Mean squared error on the test split.
ship_results.meanSquaredError

In [None]:
# Mean absolute error on the test split.
ship_results.meanAbsoluteError

In [None]:
# `corr` lives in the `pyspark.sql.functions` module (plural); the singular
# `pyspark.sql.function` does not exist and raised an import error.
from pyspark.sql.functions import corr

In [None]:
# Print summary statistics for the raw cruise ship data.
df.describe().show()

In [None]:
# Show the correlation between the 'crew' and 'passengers' columns.
df.select(corr('crew', 'passengers')).show()