In [1]:
# This exercise focuses on splitting a dataset into training and testing sets.
# The dataset used is USA_Housing.csv, which contains numerical features
# (e.g., average income, house age), a free-text Address column, and the
# target column Price.
# The goal is to load the dataset using pandas and then split it into two parts:
# one part (train set) to train a machine learning model, and one part (test set)
# to evaluate how well the model performs on unseen data.
# This step is essential for assessing a model's generalization ability.

import pandas as pd
# Raw CSV hosted on GitHub; header=0 tells pandas the first row holds the column names.
dataset = 'https://raw.githubusercontent.com/TrainingByPackt/Data-Science-with-Python/refs/heads/master/Chapter01/Data/USA_Housing.csv'
df = pd.read_csv(filepath_or_buffer=dataset, header=0)
df

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.002900,6.730821,3.09,40173.072174,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.865890,8.512727,5.13,36882.159400,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,6.309435e+05,USNS Raymond\nFPO AE 09386
5,80175.754159,4.988408,6.104512,4.04,26748.428425,1.068138e+06,"06039 Jennifer Islands Apt. 443\nTracyport, KS..."
6,64698.463428,6.025336,8.147760,3.41,60828.249085,1.502056e+06,"4759 Daniel Shoals Suite 442\nNguyenburgh, CO ..."
7,78394.339278,6.989780,6.620478,2.42,36516.358972,1.573937e+06,"972 Joyce Viaduct\nLake William, TN 17778-6483"
8,59927.660813,5.362126,6.393121,2.30,29387.396003,7.988695e+05,USS Gilbert\nFPO AA 20957
9,81885.927184,4.423672,8.167688,6.10,40149.965749,1.545155e+06,Unit 9446 Box 0958\nDPO AE 97025


In [2]:
# Feature matrix: every column except the target ('Price').
X = df.drop(columns='Price')
X

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.002900,6.730821,3.09,40173.072174,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.865890,8.512727,5.13,36882.159400,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,USNS Raymond\nFPO AE 09386
5,80175.754159,4.988408,6.104512,4.04,26748.428425,"06039 Jennifer Islands Apt. 443\nTracyport, KS..."
6,64698.463428,6.025336,8.147760,3.41,60828.249085,"4759 Daniel Shoals Suite 442\nNguyenburgh, CO ..."
7,78394.339278,6.989780,6.620478,2.42,36516.358972,"972 Joyce Viaduct\nLake William, TN 17778-6483"
8,59927.660813,5.362126,6.393121,2.30,29387.396003,USS Gilbert\nFPO AA 20957
9,81885.927184,4.423672,8.167688,6.10,40149.965749,Unit 9446 Box 0958\nDPO AE 97025


In [3]:
# Sanity check: (rows, columns) of the feature frame after dropping 'Price'.
X.shape

(5000, 6)

In [4]:
# Target vector: the 'Price' column on its own, as a Series.
Y = df.loc[:, 'Price']
Y

0       1.059034e+06
1       1.505891e+06
2       1.058988e+06
3       1.260617e+06
4       6.309435e+05
5       1.068138e+06
6       1.502056e+06
7       1.573937e+06
8       7.988695e+05
9       1.545155e+06
10      1.707046e+06
11      6.637324e+05
12      1.042814e+06
13      1.291332e+06
14      1.402818e+06
15      1.306675e+06
16      1.556787e+06
17      5.284852e+05
18      1.019426e+06
19      1.030591e+06
20      2.146925e+06
21      9.292476e+05
22      7.188872e+05
23      7.439998e+05
24      8.957371e+05
25      1.453975e+06
26      1.125693e+06
27      9.754295e+05
28      1.240764e+06
29      1.577018e+06
            ...     
4970    1.120943e+06
4971    1.111307e+06
4972    1.736402e+06
4973    1.340770e+06
4974    8.013486e+05
4975    1.324382e+06
4976    1.340344e+06
4977    1.518478e+06
4978    1.910585e+06
4979    1.823498e+06
4980    1.406865e+06
4981    1.203850e+06
4982    1.020096e+06
4983    1.194357e+06
4984    1.211900e+06
4985    1.378938e+06
4986    1.260

In [5]:
# Sanity check: the target is one-dimensional, so its shape is a 1-tuple holding the row count.
Y.shape

(5000,)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Report the shape of each piece of the split, one line per piece.
for label, part in (
    ("X_train :", X_train),
    ("X_test :", X_test),
    ("Y_train :", Y_train),
    ("Y_test :", Y_test),
):
    print(label, part.shape)

X_train : (4000, 6)
X_test : (1000, 6)
Y_train : (4000,)
Y_test : (1000,)
