# 1. Random Sampling

### 1.1 Training-Test Split

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# load the data set
df = pd.read_csv("./EngineData.csv")
y = df["torque_Nm"]
X = df.drop(columns="torque_Nm")


# Perform a random train-test split with 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (800, 9) (800,)
Test set shape: (200, 9) (200,)


In [2]:
print(X_train)
print(y_train)
print(X_test)
print(y_test)

     ambient_temperature_°C  vibration_mm/s²  pressure_bar  cooling_temp_°C  \
29                36.312623         6.494440      7.709784        89.289219   
535               36.660898         3.272072      8.219929        87.869739   
695                4.878800         1.721626      3.837180        87.771469   
557               14.699151         5.819548      2.668934        87.216850   
836               31.299928         7.886340      9.534595        87.816197   
..                      ...              ...           ...              ...   
106               10.802238         6.701610      6.885112        85.343395   
270                0.247804         1.780038      9.423638        85.599645   
860               13.703196         7.000763      6.833755        88.507551   
435               13.305337         3.389404      5.022862        86.356703   
102                2.831011         7.928826      6.282555        89.116010   

     speed_kmph_km/h  stress_N/m²  strain_unit  ang

### 1.2 Cross-Validation

In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# load the data set
df = pd.read_csv("./EngineData.csv")
y = df["torque_Nm"]
X = df.drop(columns="torque_Nm")


# Create a linear regression model
model = LinearRegression()

# Perform 5-fold cross-validation and calculate the mean R-squared score
cross_val_scores = cross_val_score(model, X, y, cv=5)
mean_r2_score = np.mean(cross_val_scores)

print("Mean R-squared score:", mean_r2_score)


Mean R-squared score: -0.028533428090833014


In [4]:
cross_val_scores

array([-0.02749734, -0.02794567, -0.01166724, -0.03598351, -0.03957338])

In [5]:
X

Unnamed: 0,ambient_temperature_°C,vibration_mm/s²,pressure_bar,cooling_temp_°C,speed_kmph_km/h,stress_N/m²,strain_unit,angular_momentum_kg·m²/s,force_N
0,38.298146,3.390437,2.537838,86.914544,18.447287,73.434898,0.703603,452.811917,1732.658428
1,12.286798,3.281659,4.331681,86.795646,115.554883,32.337016,8.891838,274.482314,1743.588990
2,29.945825,8.838192,4.768807,88.617998,75.001693,6.365340,6.913182,24.672717,135.450555
3,2.766650,6.764898,2.105763,86.098198,13.464832,99.108173,2.409190,996.151659,107.508903
4,20.453724,5.971326,7.964390,86.125763,112.328138,6.833211,2.740506,232.754081,282.217581
...,...,...,...,...,...,...,...,...,...
995,21.391659,7.458175,7.471147,86.759060,82.062176,20.685309,4.706950,278.491432,986.284555
996,16.308644,4.801672,5.710728,82.528047,82.728166,85.054203,0.655439,51.260047,1732.899953
997,18.025370,4.521215,5.073832,87.974213,84.803546,66.314881,5.097082,670.164686,598.134555
998,38.761510,2.879948,9.747712,85.849424,114.747854,39.696458,2.289354,492.963123,1195.408291


# 2. Stratified Random Sampling

In [6]:
import pandas as pd
df = pd.read_csv("./Market_research.csv")
df

Unnamed: 0,Brand,Design,Performance,Camera,Battery,Price
0,LG,9.2,8.6,8.7,7.5,8.3
1,OnePlus,8.3,8.3,7.3,8.6,8.4
2,Xiaomi,8.3,8.1,7.7,7.6,8.5
3,LG,7.3,7.3,8.0,7.8,7.4
4,Google,8.1,9.2,9.3,8.4,7.9
...,...,...,...,...,...,...
1095,Sony,8.3,8.8,8.5,7.6,8.3
1096,Google,9.5,7.2,8.6,7.1,7.8
1097,Google,9.0,8.1,8.7,8.8,7.4
1098,LG,8.5,7.1,7.3,7.3,7.7


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
X = df.drop(columns="Brand")
y = df["Brand"]
# Split the dataset into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Print the shapes of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (880, 5) (880,)
Test set shape: (220, 5) (220,)


In [8]:
X_train

Unnamed: 0,Design,Performance,Camera,Battery,Price
301,9.5,8.2,9.3,7.9,7.2
346,7.1,8.1,9.2,7.8,7.9
751,9.4,9.4,7.9,9.4,8.5
400,8.5,7.0,8.5,7.3,7.2
1082,8.1,7.4,9.4,8.6,8.4
...,...,...,...,...,...
661,8.4,8.1,9.2,8.3,8.7
654,8.3,8.5,8.5,9.4,8.9
366,7.3,8.6,9.2,7.4,8.1
800,8.3,9.1,7.4,7.2,8.2


In [9]:
X_test

Unnamed: 0,Design,Performance,Camera,Battery,Price
507,8.2,7.2,8.5,9.5,8.3
766,7.6,9.2,8.4,8.3,9.4
744,9.2,9.2,7.9,9.2,7.9
859,7.6,9.1,9.1,7.3,8.8
182,9.3,7.9,7.9,7.2,8.9
...,...,...,...,...,...
550,7.9,7.9,8.1,8.2,7.9
9,7.1,9.1,8.0,8.9,8.9
841,9.2,7.8,8.8,7.6,8.6
453,9.4,8.6,8.2,9.2,8.7


# 3. Train test split

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# load the data set
df = pd.read_csv("./HousePrices_Bengaluru.csv")

X = df.drop(columns="House Price")
y = df["House Price"]
# Split the dataset into training and testing sets with simple random sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (960, 7) (960,)
Test set shape: (240, 7) (240,)


In [11]:
print(X_train, X_test, y_train, y_test)

      Bedrooms  Square Footage  Location  Age of House  Bathrooms  \
331          4            3919     Urban            11          3   
409          3            1101  Suburban            26          2   
76           1            2249  Suburban            40          4   
868          4            1525     Urban            28          2   
138          1            2440     Urban             1          3   
...        ...             ...       ...           ...        ...   
1044         1            3095  Suburban            39          2   
1095         4            1073     Urban            27          4   
1130         3            2323  Suburban            39          3   
860          2            2000     Rural            22          1   
1126         3            1318     Urban            38          2   

      Garage Capacity  Overall Condition  
331                 1                  2  
409                 1                  1  
76                  2                  2  

In [12]:
df

Unnamed: 0,Bedrooms,Square Footage,Location,Age of House,Bathrooms,Garage Capacity,Overall Condition,House Price
0,5,3116,Rural,44,1,3,4,69276313.71
1,1,3773,Suburban,22,3,3,6,81551917.56
2,4,3504,Rural,15,2,1,4,74988800.76
3,4,2788,Suburban,30,2,1,8,61505426.21
4,4,1370,Urban,2,1,1,7,30699096.62
...,...,...,...,...,...,...,...,...
1195,2,1650,Suburban,29,4,1,5,38844934.85
1196,3,2396,Urban,1,1,3,1,52933645.85
1197,5,1063,Urban,47,4,3,6,28795533.40
1198,5,1274,Rural,29,4,2,1,31831129.81


# 4. validation test

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# load the data set
df = pd.read_csv("./HousePrices_Bengaluru.csv")
X = df.drop(columns=["House Price", "Location"])
y = df["House Price"]

# Split the dataset into training, validation, and testing sets using simple random sampling
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train the linear regression model using the training set
model = LinearRegression()
model.fit(X_train, y_train)

# Perform model selection and hyperparameter tuning using the validation set
param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_val, y_val)

best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Perform the final evaluation using the test set
y_test_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print("Testing MSE:", test_mse)


Best hyperparameters: {'fit_intercept': True, 'positive': True}
Testing MSE: 287691766947.1054


In [14]:
print(X_train, X_temp, y_train, y_temp)

      Bedrooms  Square Footage  Age of House  Bathrooms  Garage Capacity  \
522          1            2524            40          2                2   
213          2            3582            33          4                2   
120          1            3202            29          4                2   
917          3            2063            18          4                2   
1067         1            1813            34          1                2   
...        ...             ...           ...        ...              ...   
1044         1            3095            39          2                1   
1095         4            1073            27          4                3   
1130         3            2323            39          3                3   
860          2            2000            22          1                2   
1126         3            1318            38          2                1   

      Overall Condition  
522                   1  
213                  10  
120      

In [15]:
print(X_val, X_test, y_val, y_test)

      Bedrooms  Square Footage  Age of House  Bathrooms  Garage Capacity  \
1157         3            1969            18          4                2   
156          1            2365            46          4                2   
787          1            1772            11          1                2   
644          2            2543            25          3                3   
984          3            2374            32          3                1   
...        ...             ...           ...        ...              ...   
135          4            3398            40          2                3   
1101         3            1766            35          1                3   
590          2            3371            20          4                1   
1049         5            1595            30          2                1   
735          4            1362            49          3                3   

      Overall Condition  
1157                 10  
156                   3  
787      

In [16]:
df

Unnamed: 0,Bedrooms,Square Footage,Location,Age of House,Bathrooms,Garage Capacity,Overall Condition,House Price
0,5,3116,Rural,44,1,3,4,69276313.71
1,1,3773,Suburban,22,3,3,6,81551917.56
2,4,3504,Rural,15,2,1,4,74988800.76
3,4,2788,Suburban,30,2,1,8,61505426.21
4,4,1370,Urban,2,1,1,7,30699096.62
...,...,...,...,...,...,...,...,...
1195,2,1650,Suburban,29,4,1,5,38844934.85
1196,3,2396,Urban,1,1,3,1,52933645.85
1197,5,1063,Urban,47,4,3,6,28795533.40
1198,5,1274,Rural,29,4,2,1,31831129.81


### 5. Time-based Splitting

In [17]:
import pandas as pd
df = pd.read_csv("./automobile_sales_data.csv")
df

Unnamed: 0,Date,Car Model,Units Sold,Revenue
0,2022-01-01,SUV,4,124409.417891
1,2022-01-01,SUV,6,312343.022380
2,2022-01-01,Electric,2,45217.482437
3,2022-01-01,Truck,2,115767.973809
4,2022-01-01,SUV,1,30775.980466
...,...,...,...,...
27909,2022-12-31,SUV,3,230901.393564
27910,2022-12-31,Truck,9,537571.328170
27911,2022-12-31,Electric,10,475471.955502
27912,2022-12-31,Sedan,4,168215.371089


In [18]:
import pandas as pd
from datetime import datetime

# Load the automobile sales data
df = pd.read_csv('automobile_sales_data.csv')

# Convert the 'Date' column to a datetime object
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by date in chronological order
df.sort_values('Date', inplace=True)
print(df)

# Define the start and end dates for training, validation, and testing sets
start_train_date = datetime(2022, 1, 1)
end_train_date = datetime(2022, 4, 30)

start_val_date = datetime(2022, 5, 1)
end_val_date = datetime(2022, 8, 31)

start_test_date = datetime(2022, 9, 1)
end_test_date = datetime(2022, 12, 31)

# Split the data into training, validation, and testing sets
train_data = df[(df['Date'] >= start_train_date) & (df['Date'] <= end_train_date)]
val_data = df[(df['Date'] >= start_val_date) & (df['Date'] <= end_val_date)]
test_data = df[(df['Date'] >= start_test_date) & (df['Date'] <= end_test_date)]

# Optionally, you can drop the 'Date' column if it's not needed for modeling
train_data.drop(columns=['Date'], inplace=True)
val_data.drop(columns=['Date'], inplace=True)
test_data.drop(columns=['Date'], inplace=True)

# Check the shapes of the split datasets
print("Training set shape:", train_data.shape)
print("Validation set shape:", val_data.shape)
print("Testing set shape:", test_data.shape)


            Date Car Model  Units Sold        Revenue
0     2022-01-01       SUV           4  124409.417891
28    2022-01-01     Sedan           3  174392.574833
29    2022-01-01     Truck           8  380128.268021
30    2022-01-01       SUV          10  420554.389144
31    2022-01-01     Truck           4   90820.596158
...          ...       ...         ...            ...
27882 2022-12-31  Electric           9  566302.801562
27883 2022-12-31     Truck           7  343485.745047
27884 2022-12-31  Electric           8  462049.777440
27873 2022-12-31       SUV           6  177298.036307
27913 2022-12-31  Electric           8  593000.469032

[27914 rows x 4 columns]
Training set shape: (9379, 3)
Validation set shape: (9305, 3)
Testing set shape: (9230, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(columns=['Date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data.drop(columns=['Date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(columns=['Date'], inplace=True)


In [19]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit

# Load the automobile sales data
df = pd.read_csv('automobile_sales_data.csv')

# Convert the 'Date' column to a datetime object
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by date in chronological order
df.sort_values('Date', inplace=True)

# Define the feature columns and the target column (sales in this case)
X = df.drop(columns='Date')
y = df['Date']

# Define the number of splits for TimeSeriesSplit
n_splits = 3

# Create the TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Split the data into training, validation, and testing sets
for train_index, test_index in tscv.split(X):
    # Get the indices for training and testing sets
    train_indices, test_indices = train_index, test_index

# Use the obtained indices to create train, validate, and test sets
train_data = df.iloc[train_indices]
test_data = df.iloc[test_indices]

# Split the training data further into training and validation sets
val_size = int(len(train_data) * 0.2)
val_data = train_data.iloc[-val_size:]
train_data = train_data.iloc[:-val_size]

# Check the shapes of the split datasets
print("Training set shape:", train_data.shape)
print("Validation set shape:", val_data.shape)
print("Testing set shape:", test_data.shape)


Training set shape: (16749, 4)
Validation set shape: (4187, 4)
Testing set shape: (6978, 4)


In [1]:
import sys

In [2]:
sys.path

['D:\\IMLAIC\\YouTube\\ML Galaxy\\4_ Supervised Machine Learning Process\\HandsOn\\Simple linear Regression 10 step process examples\\step5_Data splitting',
 'C:\\Users\\Surya Charan Teja\\OneDrive\\Desktop\\modules',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\python310.zip',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\DLLs',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\lib',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310',
 '',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\win32',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\win32\\lib',
 'C:\\Users\\Surya Charan Teja\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\Pythonwin']