### Regression Task

1. import nessary libraries

In [6]:
# import all need libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

2. load dataset

In [7]:
# loading dataset using pandas
dataset_dir = "../dataset/Ecommerce Customers.csv"
df = pd.read_csv(dataset_dir)

In [None]:
# inspect top 5 rows
df.head()

3. Exploratory data analysis

In [None]:
# EDA session
sns.jointplot(x="Time on Website", y="Yearly Amount Spent", data=df, alpha=0.5)

In [None]:
# pairplot display all property
sns.pairplot(df, kind="scatter", plot_kws={'alpha': 0.4})

In [None]:
# draw best fit linear regression line on data
sns.lmplot(x='Length of Membership', y='Time on Website', data=df, scatter_kws={'alpha': 0.3})

In [8]:
# define X and Y
X = df[['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
y = df[['Yearly Amount Spent']]

4. Data splitting

In [9]:
# splitting dataset using scikit learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

5. Define Model & Training

In [None]:
# Initialize the LinearRegression model
lm = LinearRegression()
# training
lm.fit(X_train, y_train)

In [None]:
# checking coefficient of linear model
lm.coef_

In [None]:
# mapping coefficient to label
coef_mapping = pd.DataFrame(lm.coef_[0], X.columns, columns=['Coef'])
print(coef_mapping)

6. Prediction

In [None]:
# prediction first record of test data
lm.predict([X_test.iloc[0]])

In [12]:
# convert y_test to suitable dataframe
Y_test = pd.DataFrame(y_test.to_numpy(), columns=["prediction"])

In [13]:
# making prediction with all testing dataset
predictions = lm.predict(X_test)

In [14]:
# combine ground true with prediction
predictions_df = pd.concat([pd.DataFrame(predictions, columns=['Actual']), Y_test], axis=1)

In [None]:
# plot prediction with ground true
sns.lmplot(x='Actual', y='prediction', data=predictions_df, scatter_kws={'alpha': 0.3})

7. Evaluating model using the discussed metrics

In [None]:
# discussed metrics using scikit learn

from sklearn.metrics import mean_squared_error, mean_absolute_error

print("mean_squared_error: ", mean_squared_error(y_test.to_numpy(), predictions))
print("mean_absolute_error: ", mean_absolute_error(y_test, predictions))


#### Classification Task

In [25]:
iris_dir = "../dataset/Iris.csv"
iris_df = pd.read_csv(iris_dir)

In [26]:
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [31]:
# define X and Y
X_Iris = iris_df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y_Iris = iris_df[['Species']]

In [32]:
# splitting dataset using scikit learn
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_Iris, y_Iris, test_size=0.3, random_state=42)

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model_iris = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model_iris.fit(X_iris_train, y_iris_train)

  return fit_method(estimator, *args, **kwargs)


In [43]:
X_iris_test.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [34]:
# Make predictions
y_pred = model_iris.predict(X_iris_test)

In [35]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model
accuracy = accuracy_score(y_iris_test, y_pred)
report = classification_report(y_iris_test, y_pred)
conf_matrix = confusion_matrix(y_iris_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
Classification Report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        19
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00        13

       accuracy                           1.00        45
      macro avg       1.00      1.00      1.00        45
   weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
 [[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]


In [48]:
# Save the model if needed
import joblib
joblib.dump(model_iris, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [1]:
# load the model was trained
import joblib
model = joblib.load('./random_forest_model.pkl')

testing_1 = [[6.1,	2.8,	4.7,	1.2]]
prediction = model.predict(testing_1)
print(prediction)

['Iris-versicolor']


