## 1. Base Line (mean prediction)
## 2. Data Cleaning & preparation
## 3. Linear Regression Model
## 4. Feature Engg.
## 5. Tunning
## 6. Cross Validation
## 7. Residual Analysis

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
data = {
	'StudyHours': [2, 3, 4, 5, 6, 7, 8, 9, 10],
	'Attendance': [60, 65, 70, 72, 75, 80, 85, 88, 90],
	'PrevScore': [40, 45, 50, 55, 60, 65, 70, 75, 80],
	'FinalMarks': [50, 55, 60, 65, 70, 75, 78, 85, 88]
}
df=pd.DataFrame(data)
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [5]:
y=df['FinalMarks']
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [6]:
y_pred_baseline=np.repeat(y.mean(),len(y))

In [7]:
mae=mean_absolute_error(y,y_pred_baseline)
print(mae)

10.716049382716049


In [8]:
mse=mean_squared_error(y,y_pred_baseline)

In [9]:
print(mse)

154.02469135802468


In [10]:
rmse=np.sqrt(mse)

In [11]:
print(rmse)

12.410668449282847


In [12]:
r2 = r2_score(y,y_pred_baseline)
print(f"Baeline MAE={mae:.2f},RMSE={rmse:.2f},R-Squared={r2:.2f}")

Baeline MAE=10.72,RMSE=12.41,R-Squared=0.00


In [13]:
#clean the data
print(df.isnull)

<bound method DataFrame.isnull of    StudyHours  Attendance  PrevScore  FinalMarks
0           2          60         40          50
1           3          65         45          55
2           4          70         50          60
3           5          72         55          65
4           6          75         60          70
5           7          80         65          75
6           8          85         70          78
7           9          88         75          85
8          10          90         80          88>


In [14]:
df=df.fillna(df.mean())

In [15]:
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [21]:
#Data prep, find input, output
X = df[['StudyHours','Attendance','PrevScore']]
y=df['FinalMarks']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [22]:
#Model creation
model=LinearRegression()

In [23]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [29]:
print('Original Final Marks')
print(y)
y_all_prediction=model.predict(X)
print('Predicted all Final Marks')
print(y_all_prediction)

Original Final Marks
0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64
Predicted all Final Marks
[50.65945946 55.16756757 59.67567568 64.62162162 69.42162162 73.92972973
 78.43783784 83.23783784 88.18378378]


In [24]:
y_pred=model.predict(X_test)
print(y_pred)

[83.23783784 55.16756757 73.92972973]


In [25]:
mae=mean_absolute_error(y_test,y_pred)
print(mae)

0.9999999999999858


In [26]:
mse=mean_squared_error(y_test,y_pred)
print(mse)

1.4262576089602703


In [27]:
rmse=np.sqrt(mse)
print(rmse)

1.1942602768912103


In [28]:
r2=r2_score(y_test,y_pred)
print(r2)

0.9908312010852554


In [30]:
print(f"Baeline MAE={mae:.2f},RMSE={rmse:.2f},R-Squared={r2:.2f}")

Baeline MAE=1.00,RMSE=1.19,R-Squared=0.99


# Base Line (mean prediction)
2. Data Cleaning & preparation
3. Linear Regression Model
4. Find out MAE, RMSE, R-squared
5. Data Set as follows

data = {'Area': [850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
    'Bedrooms': [1, 2, 2, 2, 3, 3, 2, 4, 4],
   'Age': [1, 1, 3, 2, 1, 2, 2, 1, 2],
   'Priceln100K': [5, 6, 7, 7, 70, 9, 9, 11, None]
}

In [62]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [63]:
data = {
    'Area': [850, 900, 1000, 1100, 1200, 1500, 1600, 1800, 2000],
    'Bedrooms': [1, 2, 2, 2, 3, 3, 2, 4, 4],
    'Age': [1, 1, 3, 2, 1, 2, 2, 1, 2],
    'PriceIn100K': [5, 6, 7, 7, 7, 9, 9, 11, None]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

Original Data:
   Area  Bedrooms  Age  PriceIn100K
0   850         1    1          5.0
1   900         2    1          6.0
2  1000         2    3          7.0
3  1100         2    2          7.0
4  1200         3    1          7.0
5  1500         3    2          9.0
6  1600         2    2          9.0
7  1800         4    1         11.0
8  2000         4    2          NaN


In [64]:
df['PriceIn100K'].fillna(df['PriceIn100K'].mean(), inplace=True)

print("\nAfter fillna():")
print(df)


After fillna():
   Area  Bedrooms  Age  PriceIn100K
0   850         1    1        5.000
1   900         2    1        6.000
2  1000         2    3        7.000
3  1100         2    2        7.000
4  1200         3    1        7.000
5  1500         3    2        9.000
6  1600         2    2        9.000
7  1800         4    1       11.000
8  2000         4    2        7.625


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PriceIn100K'].fillna(df['PriceIn100K'].mean(), inplace=True)


In [54]:
X = df[['Area', 'Bedrooms', 'Age']]
y = df['PriceIn100K']

In [55]:
model = LinearRegression()
model.fit(X, y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [56]:
y_pred = model.predict(X)

In [57]:
mae = mean_absolute_error(y, y_pred)
print(mae)

0.20717694336305414


In [58]:
mse = mean_squared_error(y, y_pred)
print(mse)

0.0695926995688391


In [45]:
rmse = np.sqrt(mse)
print(rmse)

0.2638042826961668


In [46]:
r2 = r2_score(y, y_pred)
print(r2)

0.9784834165584265


In [48]:
print(f"Baseline MAE={mae:.2f},RMSE={rmse:.2f},R-Squared={r2:.2f}")

Baseline MAE=0.21,RMSE=0.26,R-Squared=0.98
