In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [10]:
# Build the SparkSession used by the rest of the notebook (getOrCreate
# returns an existing session if one is already running).
session_builder = SparkSession.builder.appName('Practice')
spark = session_builder.getOrCreate()

In [11]:
# Display the session object to confirm it was created.
spark

In [12]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the data.
df_spark = spark.read.csv(
    'AirPassengers.csv',
    header=True,
    inferSchema=True,
)
df_spark

DataFrame[Month: string, Passengers: int]

In [13]:
# Preview the first rows of the loaded data.
df_spark.show()

+-------+----------+
|  Month|Passengers|
+-------+----------+
|1949-01|       112|
|1949-02|       118|
|1949-03|       132|
|1949-04|       134|
|1949-05|       133|
|1949-06|       135|
|1949-07|       148|
|1949-08|       148|
|1949-09|       136|
|1949-10|       119|
|1949-11|       104|
|1949-12|       118|
|1950-01|       115|
|1950-02|       126|
|1950-03|       141|
|1950-04|       135|
|1950-05|       125|
|1950-06|       149|
|1950-07|       170|
|1950-08|       170|
+-------+----------+
only showing top 20 rows



In [15]:
from pyspark.sql.functions import to_date,month,year

In [16]:
# Parse the 'yyyy-MM' strings in Month into a date column named Date,
# keeping Passengers alongside it (the original Month string is dropped).
month_as_date = to_date(df_spark['Month'], 'yyyy-MM').alias('Date')
df_spark = df_spark.select(month_as_date, 'Passengers')

In [18]:
# Derive Month and Year columns from Date and keep them next to the
# original Date and Passengers columns.
month_col = month(df_spark['Date']).alias('Month')
year_col = year(df_spark['Date']).alias('Year')
df_spark = df_spark.select('Date', 'Passengers', month_col, year_col)
df_spark.show()

+----------+----------+-----+----+
|      Date|Passengers|Month|Year|
+----------+----------+-----+----+
|1949-01-01|       112|    1|1949|
|1949-02-01|       118|    2|1949|
|1949-03-01|       132|    3|1949|
|1949-04-01|       134|    4|1949|
|1949-05-01|       133|    5|1949|
|1949-06-01|       135|    6|1949|
|1949-07-01|       148|    7|1949|
|1949-08-01|       148|    8|1949|
|1949-09-01|       136|    9|1949|
|1949-10-01|       119|   10|1949|
|1949-11-01|       104|   11|1949|
|1949-12-01|       118|   12|1949|
|1950-01-01|       115|    1|1950|
|1950-02-01|       126|    2|1950|
|1950-03-01|       141|    3|1950|
|1950-04-01|       135|    4|1950|
|1950-05-01|       125|    5|1950|
|1950-06-01|       149|    6|1950|
|1950-07-01|       170|    7|1950|
|1950-08-01|       170|    8|1950|
+----------+----------+-----+----+
only showing top 20 rows



In [19]:
# Print each column's name, type and nullability to check the conversions.
df_spark.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)



In [20]:
# List the column names currently in the DataFrame.
df_spark.columns

['Date', 'Passengers', 'Month', 'Year']

In [21]:
from pyspark.ml.feature import VectorAssembler

In [23]:
# Assembler that packs the Month and Year columns into a single vector
# column, Independent_Features, holding the model inputs.
feature_assembler = VectorAssembler(
    inputCols=['Month', 'Year'],
    outputCol='Independent_Features',
)

In [24]:
# Apply the assembler: adds the Independent_Features vector column to df_spark.
output=feature_assembler.transform(df_spark)

In [25]:
# Preview the assembled rows, including the new feature vector column.
output.show()

+----------+----------+-----+----+--------------------+
|      Date|Passengers|Month|Year|Independent_Features|
+----------+----------+-----+----+--------------------+
|1949-01-01|       112|    1|1949|        [1.0,1949.0]|
|1949-02-01|       118|    2|1949|        [2.0,1949.0]|
|1949-03-01|       132|    3|1949|        [3.0,1949.0]|
|1949-04-01|       134|    4|1949|        [4.0,1949.0]|
|1949-05-01|       133|    5|1949|        [5.0,1949.0]|
|1949-06-01|       135|    6|1949|        [6.0,1949.0]|
|1949-07-01|       148|    7|1949|        [7.0,1949.0]|
|1949-08-01|       148|    8|1949|        [8.0,1949.0]|
|1949-09-01|       136|    9|1949|        [9.0,1949.0]|
|1949-10-01|       119|   10|1949|       [10.0,1949.0]|
|1949-11-01|       104|   11|1949|       [11.0,1949.0]|
|1949-12-01|       118|   12|1949|       [12.0,1949.0]|
|1950-01-01|       115|    1|1950|        [1.0,1950.0]|
|1950-02-01|       126|    2|1950|        [2.0,1950.0]|
|1950-03-01|       141|    3|1950|        [3.0,1

In [26]:
# Keep only what the model needs: the feature vector (independent
# variables) and the Passengers count (dependent variable).
model_columns = ['Independent_Features', 'Passengers']
final_data = output.select(*model_columns)

In [27]:
# Preview the modelling dataset (features and label only).
final_data.show()

+--------------------+----------+
|Independent_Features|Passengers|
+--------------------+----------+
|        [1.0,1949.0]|       112|
|        [2.0,1949.0]|       118|
|        [3.0,1949.0]|       132|
|        [4.0,1949.0]|       134|
|        [5.0,1949.0]|       133|
|        [6.0,1949.0]|       135|
|        [7.0,1949.0]|       148|
|        [8.0,1949.0]|       148|
|        [9.0,1949.0]|       136|
|       [10.0,1949.0]|       119|
|       [11.0,1949.0]|       104|
|       [12.0,1949.0]|       118|
|        [1.0,1950.0]|       115|
|        [2.0,1950.0]|       126|
|        [3.0,1950.0]|       141|
|        [4.0,1950.0]|       135|
|        [5.0,1950.0]|       125|
|        [6.0,1950.0]|       149|
|        [7.0,1950.0]|       170|
|        [8.0,1950.0]|       170|
+--------------------+----------+
only showing top 20 rows



In [28]:
from pyspark.ml.regression import LinearRegression

In [31]:
# Split the data 75/25 into train and test sets and fit a linear regression
# on the training split.
# A fixed seed makes the random split repeatable, so the coefficients and the
# metrics reported in later cells do not change from one run to the next.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)
# Keep the unfitted estimator under its own name so that `regressor` always
# refers to the fitted model used by the cells that follow.
lr_estimator = LinearRegression(featuresCol='Independent_Features', labelCol='Passengers')
regressor = lr_estimator.fit(train_data)

In [32]:
# Fitted coefficients; the feature vector was assembled from Month then Year.
regressor.coefficients

DenseVector([2.2044, 31.7139])

In [33]:
# Fitted intercept term of the linear model.
regressor.intercept

-61721.015473929896

In [35]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the per-row predictions and the summary metrics inspected below.
pred_results=regressor.evaluate(test_data)

In [36]:
# Show actual Passengers values next to the model's predictions for the test rows.
pred_results.predictions.show()



+--------------------+----------+------------------+
|Independent_Features|Passengers|        prediction|
+--------------------+----------+------------------+
|        [1.0,1951.0]|       145|155.00683354300418|
|        [1.0,1958.0]|       340|377.00412994196813|
|        [1.0,1960.0]|       417|440.43192891310173|
|        [2.0,1960.0]|       391| 442.6363400453556|
|        [3.0,1949.0]|       132| 95.98785683637107|
|        [3.0,1951.0]|       178|159.41565580750466|
|        [3.0,1952.0]|       193| 191.1295552930751|
|        [3.0,1957.0]|       356|349.69905272090546|
|        [4.0,1956.0]|       313| 320.1895643675889|
|        [4.0,1958.0]|       348| 383.6173633387225|
|        [5.0,1951.0]|       172|163.82447807201243|
|        [5.0,1952.0]|       183| 195.5383775575756|
|        [5.0,1953.0]|       229|227.25227704314602|
|        [5.0,1954.0]|       234| 258.9661765287092|
|        [5.0,1956.0]|       318| 322.3939754998428|
|        [5.0,1960.0]|       472|   449.249573

In [37]:
# Mean absolute error on the test split.
pred_results.meanAbsoluteError

32.09257910162469

In [38]:
# Mean squared error on the test split.
pred_results.meanSquaredError

2026.1934554885713

In [39]:
# Coefficient of determination (R^2) on the test split.
pred_results.r2

0.8463804449210506