In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [129]:
# Read the startups CSV (path relative to this notebook) into a DataFrame
# and print a text preview of its first five rows.
file_path = '../datasets/50_Startups.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path)
print(dataset.head(n=5))

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [130]:
# Separate features (X) and target variable (Y).
# Both slices are anchored to the last column so they can never overlap:
# X is every column except the last, Y is the last column (Profit in the
# preview printed by the loading cell). A hard-coded target index would
# silently leak the target into X if the column count ever changed.
# `.values` returns NumPy arrays; X mixes numbers and strings (State), so it
# is an object-dtype array.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

In [131]:
# Sanity check: show the first five feature rows and the first five targets.
print("X: ", X[0:5])
print("Y: ", Y[0:5])

X:  [[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]
Y:  [192261.83 191792.06 191050.39 182901.99 166187.94]


In [132]:
# Mean-impute the three numeric columns (indices 0-2).
# NOTE: `missing_values=0.0` means entries that are exactly 0.0 are treated as
# missing and replaced by the column mean (computed over the non-zero entries).
# NOTE(review): a genuine zero spend would also be overwritten - confirm this
# is the intended definition of "missing" for this dataset.
imputer = SimpleImputer(missing_values=0.0, strategy="mean")
imputer.fit(X[:, :3])
X[:, :3] = imputer.transform(X[:, :3])
print(X[0:5])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]


In [133]:
# Replace the State strings (column 3) with integer codes via LabelEncoder.
# In the printed preview California becomes 0, Florida 1 and New York 2.
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
print(X[0:5])

[[165349.2 136897.8 471784.1 2]
 [162597.7 151377.59 443898.53 0]
 [153441.51 101145.55 407934.54 1]
 [144372.41 118671.85 383199.62 2]
 [142107.34 91391.77 366168.42 1]]


In [134]:
# One-hot encode the State codes (column 3) into dummy columns.
# The numeric columns are passed through untouched; as the printed preview
# shows, the dummy columns come first and the passthrough columns follow.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = ct.fit_transform(X)
print(X[0:5])

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]]


In [135]:
# X1 is X without its first column, i.e. without the first State dummy
# (presumably to avoid the dummy-variable trap - the remaining two dummies
# still identify the state).
X1 = X[:, 1:]
print(X1[0:5])

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]]


In [136]:
# Split both feature matrices into training and test sets.
# Both matrices are one-hot encoded: X keeps all three State dummy columns,
# while X1 is X with the first dummy column removed. The same test_size and
# random_state are used for both calls so the two models are trained and
# evaluated on the same rows.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y, test_size=0.2, random_state=0)

In [137]:
# Fit two ordinary-least-squares models:
#   regressor  - trained on X  (all three State dummies + numeric columns)
#   regressor1 - trained on X1 (first State dummy dropped)
regressor = LinearRegression()
regressor1 = LinearRegression()

regressor.fit(X_train, Y_train)
regressor1.fit(X1_train, Y1_train)

In [138]:
# Score the held-out test rows with each model, using the matching feature
# matrix for each (X_test for regressor, X1_test for regressor1).
y_pred = regressor.predict(X=X_test)
y1_pred = regressor1.predict(X=X1_test)

In [139]:
# Print both prediction vectors, each under its own heading.
# NOTE: the second heading says "non-encoded", but y1_pred comes from the
# model trained on X1, which is still one-hot encoded - it only lacks the
# first State dummy column.
print("Predictions using all features (including encoded categorical data):", y_pred, sep="\n")

print("\nPredictions using non-encoded features only:", y1_pred, sep="\n")

Predictions using all features (including encoded categorical data):
[102388.94113042 121465.72713518 127340.57708621  71709.47538914
 174211.08480003 121771.65061493  68393.54360667  95588.53133488
 116596.34676992 162514.07218554]

Predictions using non-encoded features only:
[102388.94113034 121465.72713518 127340.57708614  71709.47538914
 174211.08479988 121771.65061493  68393.54360679  95588.53133493
 116596.34676983 162514.07218542]


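This notebook loads the dataset, replaces zero-valued entries in the numeric columns with the column mean, encodes the categorical State column (label encoding followed by one-hot encoding), splits the data into training and test sets, trains two linear regression models, and makes predictions on the test set using both models. The first model uses all encoded features (three State dummy columns plus the three numeric columns); the second uses the same features with the first State dummy column dropped — these are not "non-encoded" features, despite the printed label. The two sets of predictions agree to within floating-point rounding, which is expected: the dropped dummy column is redundant given the other two dummies and the model's intercept.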