In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Generate a reproducible synthetic dataset.
# Features: two columns drawn uniformly from [0, 10).
# Target: drawn uniformly from [-10, 10), sampled independently of the features.
np.random.seed(0)
num_samples = 1000
feature_draws = np.random.rand(num_samples, 2)  # drawn first, so the random stream order is unchanged
target_draws = np.random.rand(num_samples)
X = 10 * feature_draws
y = 20 * target_draws - 10

In [3]:
# Peek at the first five feature rows.
print(X[:5])

[[5.48813504 7.15189366]
 [6.02763376 5.44883183]
 [4.23654799 6.45894113]
 [4.37587211 8.91773001]
 [9.63662761 3.83441519]]


In [4]:
# Peek at the first five target values.
print(y[:5])

[ 6.23036941 -0.47832028  0.4631198  -4.98958827  2.10086034]


In [5]:
# Hold out 20% of the samples for evaluation; random_state is fixed so the
# same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Building blocks for the pipeline: a linear regression estimator and the
# feature standardiser that will run in front of it.
regression = LinearRegression()
scaler = StandardScaler()

In [7]:
# Chain the steps: standardise the features, then fit the linear regression.
pipeline = Pipeline(
    steps=[
        ('scaler', scaler),
        ('regression', regression),
    ]
)

In [8]:
# Fit every pipeline step on the training partition only.
pipeline.fit(X=X_train, y=y_train)

In [9]:
# Pull the fitted linear model out of the pipeline and report its parameters.
# The model sits behind the 'scaler' step, so the coefficients apply to the
# scaler's output rather than to the raw feature values.
fitted_regression = pipeline.named_steps['regression']
coefficients, intercept = fitted_regression.coef_, fitted_regression.intercept_
print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [ 0.15546559 -0.00932743]
Intercept: 0.09379541976188044


In [10]:
# Predict targets for the held-out test partition.
y_pred = pipeline.predict(X=X_test)

In [11]:
# Side-by-side comparison of the first five held-out targets and the
# corresponding predictions. pandas is imported in the notebook's import cell.
data = {'Actual': y_test[:5], 'Predicted': y_pred[:5]}
df = pd.DataFrame(data)
# The frame holds at most five rows, so printing it whole shows everything.
print(df)

     Actual  Predicted
0  7.593631   0.031606
1  8.618826  -0.070879
2  6.433439   0.021079
3  7.068987   0.116030
4 -1.042438   0.185484


In [12]:
# RMSE: square root of the mean squared error between the held-out targets
# and the pipeline's predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE) on test data:", rmse)

Root Mean Squared Error (RMSE) on test data: 5.547618269160591


In [13]:
# Step 1: Import necessary libraries
# NOTE: StandardScaler, LinearRegression and Pipeline are rebound here to the
# Spark ML classes, replacing the scikit-learn classes of the same names
# imported at the top of the notebook. This cell also reassigns num_samples,
# scaler, regression, pipeline, coefficients, intercept and rmse. Re-run the
# notebook from the top before re-running any of the scikit-learn cells.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Step 2: Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Step 3: Generate simulated data
# feature1 is uniform on [0, 10), feature2 on [0, 5), label on [-10, 10).
# Each RAND() call is given an explicit seed so that re-running the cell
# regenerates the same data instead of a fresh random draw. The seeds are
# distinct and widely spaced so the three columns do not share a random stream.
# NOTE(review): the generated values may still depend on how Spark partitions
# the range -- confirm reproducibility on the target cluster.
num_samples = 1000
simulated_data = spark.range(num_samples).selectExpr(
    "id as id",
    "(RAND(10011) * 10) as feature1",
    "(RAND(20011) * 5) as feature2",
    "(RAND(30011) * 20 - 10) as label"
)

# Step 4: Split the data into training and testing sets
# The split itself is seeded, so together with the seeded data above the
# train/test partitions are repeatable.
train_ratio = 0.8
test_ratio = 1 - train_ratio
training_data, testing_data = simulated_data.randomSplit([train_ratio, test_ratio], seed=42)

# Step 5: Define the pipeline stages
# The assembler packs the two feature columns into one vector column, the
# scaler writes its output to "scaledFeatures", and the regression reads
# that scaled column.
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
regression = LinearRegression(featuresCol="scaledFeatures", labelCol="label")

# Step 6: Create the pipeline
pipeline_stages = [assembler, scaler, regression]
pipeline = Pipeline(stages=pipeline_stages)

# Step 7: Fit the pipeline to the training data
pipeline_model = pipeline.fit(training_data)

# Step 8: Print the coefficients
# The last fitted stage is the linear regression model.
coefficients = pipeline_model.stages[-1].coefficients
intercept = pipeline_model.stages[-1].intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)

# Step 9: Make predictions on the testing data
predictions = pipeline_model.transform(testing_data)

# Step 10: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data:", rmse)

# Step 11: Display top five actual vs. predicted
predictions.select("label", "prediction").show(5)


Coefficients: [0.03728169914728992,-0.12030905007664384]
Intercept: -0.5073217174734503
Root Mean Squared Error (RMSE) on test data: 5.645031670498643
+-------------------+--------------------+
|              label|          prediction|
+-------------------+--------------------+
| 0.5870884296808772| -0.6773930782772895|
|-1.5116476177150382| -0.6510817635236585|
|   4.97742406802943|-0.47378774989898714|
|  4.793709911751325| -0.5328694373524987|
| 7.6492319066567624| -0.4094695469197426|
+-------------------+--------------------+
only showing top 5 rows

