In [42]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

In [31]:
car_uncleaned_df=pd.read_csv('/content/cars.csv')
car_uncleaned_df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [32]:
car_uncleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  32 non-null     object 
 1   mpg         32 non-null     float64
 2   cyl         32 non-null     int64  
 3   disp        32 non-null     float64
 4   hp          32 non-null     int64  
 5   drat        32 non-null     float64
 6   wt          32 non-null     float64
 7   qsec        32 non-null     float64
 8   vs          32 non-null     int64  
 9   am          32 non-null     int64  
 10  gear        32 non-null     int64  
 11  carb        32 non-null     int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 3.1+ KB


In [33]:
# the data is clean
car_uncleaned_df.columns

Index(['Unnamed: 0', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs',
       'am', 'gear', 'carb'],
      dtype='object')

In [34]:
car_cleaned_df = car_uncleaned_df.rename(columns={'Unnamed: 0': 'car_name'})

In [35]:
car_cleaned_df.isnull().sum()


Unnamed: 0,0
car_name,0
mpg,0
cyl,0
disp,0
hp,0
drat,0
wt,0
qsec,0
vs,0
am,0


In [36]:
car_cleaned_df.isnull().sum()


Unnamed: 0,0
car_name,0
mpg,0
cyl,0
disp,0
hp,0
drat,0
wt,0
qsec,0
vs,0
am,0


In [37]:
car_cleaned_df.columns

Index(['car_name', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs',
       'am', 'gear', 'carb'],
      dtype='object')

In [44]:
# used kfold

# Cleaned the data
df_clean = car_cleaned_df.copy()
df_clean.rename(columns={'Unnamed: 0': 'car_name'}, inplace=True)
df_clean.dropna(inplace=True)




In [45]:
# Prepare features and target
X = df_clean.drop(columns=['car_name', 'mpg'])
y = df_clean['mpg']

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores_kfold = []

#K-Fold Loop
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [46]:

#Scaling
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

#Train model
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

#Evaluate
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    r2_scores_kfold.append(r2)

#Results
r2_scores_kfold, np.mean(r2_scores_kfold)

([0.6358049706410402], np.float64(0.6358049706410402))

In [47]:
# used train test split

df_clean = car_cleaned_df.copy()
df_clean.dropna(inplace=True)

# Prepare features and target
X = df_clean.drop(columns=['car_name', 'mpg'])
y = df_clean['mpg']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)


In [48]:

# Scaling features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Calculate r2 score
r2 = r2_score(y_test, y_pred)

print("Predicted values:", y_pred[:5])
print("R² Score:", r2)


Predicted values: [19.83092709 10.98714165 16.3140211  27.14590717 28.61321199]
R² Score: 0.7466453084791008


In [49]:
# used stratified k fold

df_clean = car_cleaned_df.copy()
df_clean.dropna(inplace=True)

# Prepare features and target
X = df_clean.drop(columns=['car_name', 'mpg'])
y = df_clean['mpg']

# Bin continuous target for stratification
y_binned = pd.qcut(y, q=5, labels=False)

# StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []



In [50]:
for train_index, test_index in skf.split(X, y_binned):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale features
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train linear regression model
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

print("R² scores per fold:", r2_scores)
print("Average R² score:", sum(r2_scores) / len(r2_scores))

R² scores per fold: [0.7958716639537351, 0.6873958440049661, 0.6120066436827524, 0.5208810682301784, 0.7838753889022345]
Average R² score: 0.6800061217547733
