In [1]:
import findspark
findspark.init("/opt/spark")

In [27]:
import datetime as dt

import pandas as pd
from pyspark.sql import DataFrame, Column, SparkSession
#from pyspark.sql.functions import mean, udf, lit, to_json, expr, col, struct
from pyspark import SparkContext

In [3]:
# read data
df = pd.read_csv("https://raw.githubusercontent.com/erkansirin78/datasets/master/Advertising.csv")
print(df.head())

   ID     TV  Radio  Newspaper  Sales
0   1  230.1   37.8       69.2   22.1
1   2   44.5   39.3       45.1   10.4
2   3   17.2   45.9       69.3    9.3
3   4  151.5   41.3       58.5   18.5
4   5  180.8   10.8       58.4   12.9


In [4]:
# Feature matrix
X = df.iloc[:, 1:-1].values
print(X.shape)
print(X[:3])

(200, 3)
[[230.1  37.8  69.2]
 [ 44.5  39.3  45.1]
 [ 17.2  45.9  69.3]]


In [5]:
# Output variable
y = df.iloc[:, -1]
print(y.shape)
print(y[:6])

(200,)
0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
5     7.2
Name: Sales, dtype: float64


In [6]:
# split test train
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
# train model
from sklearn.ensemble import RandomForestRegressor

In [9]:
estimator = RandomForestRegressor(n_estimators=200)
estimator.fit(X_train, y_train)

In [10]:
# Test model
y_pred = estimator.predict(X_test)
from sklearn.metrics import r2_score

In [11]:
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R2: {r2}")

R2: 0.9822621361303673


In [12]:
# Save Model
import joblib
joblib.dump(estimator, "randomforest_with_advertising.pkl")

['randomforest_with_advertising.pkl']

In [13]:
# make predictions
# Read models
estimator_loaded = joblib.load("randomforest_with_advertising.pkl")

In [14]:
# Prediction set
X_manual_test = [[230.1,37.8,69.2]]
print("X_manual_test", X_manual_test)

prediction = estimator_loaded.predict(X_manual_test)
print("prediction", prediction)

X_manual_test [[230.1, 37.8, 69.2]]
prediction [21.998]


In [15]:
def prediction(df:DataFrame, model, spark_context:SparkContext) -> Column:
    """
    Predicts using a given model and a Spark dataframe.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
        Spark dataframe containing the data to be predicted
    model : object
        Model object to be used for prediction
    spark_context : Spark Context
        Spark Context instance

    Returns
    -------
    pyspark.sql.column.Column
        Spark dataframe column containing the predicted values
    """
    model = spark_context.broadcast(model)

    @udf('integer')
    def predict_data(*cols):
        return int(model.value.predict((cols,)))

    return predict_data(*df.columns)

In [16]:
spark = SparkSession.builder.appName("Spark ML").master("local[*]").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/02 17:06:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
sc = spark.sparkContext

In [30]:
from pyspark import SparkFiles
github_url="https://raw.githubusercontent.com/erkansirin78/datasets/master/Advertising.csv"
sc.addFile(github_url)
df = spark.read.csv(SparkFiles.get("Advertising.csv"),header= True, inferSchema=True)
df.show(5)

24/07/02 17:48:03 WARN SparkContext: The path https://raw.githubusercontent.com/erkansirin78/datasets/master/Advertising.csv has been added already. Overwriting of added paths is not supported in the current version.


+---+-----+-----+---------+-----+
| ID|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
+---+-----+-----+---------+-----+
only showing top 5 rows



In [20]:
pred = prediction(df=df, model=estimator_loaded, spark_context=sc)

In [25]:
df_pred = df.withColumn("PREDICTION", prediction(df=df.select('TV','Radio','Newspaper'), model=estimator_loaded, spark_context=sc))

In [31]:
df_pred.show(5)

[Stage 8:>                                                          (0 + 1) / 1]

+---+-----+-----+---------+-----+----------+
| ID|   TV|Radio|Newspaper|Sales|PREDICTION|
+---+-----+-----+---------+-----+----------+
|  1|230.1| 37.8|     69.2| 22.1|        21|
|  2| 44.5| 39.3|     45.1| 10.4|        10|
|  3| 17.2| 45.9|     69.3|  9.3|         8|
|  4|151.5| 41.3|     58.5| 18.5|        18|
|  5|180.8| 10.8|     58.4| 12.9|        13|
+---+-----+-----+---------+-----+----------+
only showing top 5 rows



                                                                                

In [28]:
spark.version

'3.5.1'

In [29]:
! python -V

Python 3.8.10
