## A. Encoding Categoricals

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

### Reading data

In [2]:
# Load the regression dataset from the CSV file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='regression_data.csv')
data.head(n=5)

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,7.760467,17.343389,4.181353,Male,21.0
1,3.08,6.20859,16.230984,4.150313,Male,3.0
2,7.5,7.113956,18.047227,4.205057,Female,20.0
3,6.7,5.783825,11.73711,4.055333,Male,5.0
4,8.785714,6.64379,12.494862,4.088969,Female,10.0


In [3]:
# Which distinct categories appear in the gender column?
# .unique() returns them in order of first appearance.
data.loc[:, "gender"].unique()


array(['Male', 'Female', 'U'], dtype=object)

In [4]:
# Number of rows per gender category, most frequent first (defaults spelled out).
data["gender"].value_counts(sort=True, ascending=False)

Female    2664
Male      1895
U          111
Name: gender, dtype: int64

### Splitting data (X-y split)

In [5]:
# Separate the features (X) from the target (y).
# Think carefully about the best point in the workflow to do your splits.
X = data.drop(columns=['TARGET_D'])
y = data['TARGET_D']
X.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender
0,15.5,7.760467,17.343389,4.181353,Male
1,3.08,6.20859,16.230984,4.150313,Male
2,7.5,7.113956,18.047227,4.205057,Female
3,6.7,5.783825,11.73711,4.055333,Male
4,8.785714,6.64379,12.494862,4.088969,Female


### Encoding Categoricals

In [6]:
# Tip: keep the numerical and the categorical columns in separate frames.
X_cat = X.select_dtypes(include=['object'])
X_num = X.select_dtypes(include=[np.number])

#### Option 1: Encoding with Get_dummies

In [17]:
# get_dummies could also be given X directly: it only encodes the
# object/category columns, so separating them beforehand is not required.
# drop_first=False keeps one indicator column per category.
X_cat = pd.get_dummies(data=X_cat, drop_first=False)
X_cat

Unnamed: 0,gender_Female,gender_Male,gender_U
0,0,1,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0
...,...,...,...
4665,0,1,0
4666,0,1,0
4667,1,0,0
4668,0,1,0


#### Option 2: Encoding with OneHotEncoder

In [8]:
# OneHotEncoder is a fitted object, so the same encoding can be reused
# somewhere else besides this notebook (e.g. on new data at prediction time).
X_cat = X.select_dtypes(include='object')  # apply only to the categorical columns
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

# After .fit() the categories are sorted and the first one ('Female') is
# dropped, so each gender maps onto the columns [Male, U] like this:
#   'Male':   [1, 0]
#   'Female': [0, 0]
#   'U':      [0, 1]

encoded = encoder.transform(X_cat).toarray()  # densify the encoder output
# Take the column names from the fitted encoder (all categories except the
# dropped first one) instead of hard-coding them, and keep X_cat's index so the
# frame aligns row-by-row with X_num when both are concatenated later.
# NOTE: this assumes a single categorical column; with several, prefer
# encoder.get_feature_names_out() to obtain unambiguous names.
cat_encoded = pd.DataFrame(
    encoded,
    columns=encoder.categories_[0][1:],
    index=X_cat.index,
)
cat_encoded

Unnamed: 0,Male,U
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
4665,1.0,0.0
4666,1.0,0.0
4667,0.0,0.0
4668,1.0,0.0


In [9]:
# Rebuild the feature matrix: numerical columns + one-hot encoded categoricals.
# This cell overwrites X, so any encoded columns appended by a previous run are
# dropped first; otherwise re-running the cell would pick them up as numerical
# features and duplicate them. On a first run the drop is a no-op.
X_num = X.select_dtypes(include=np.number).drop(columns=cat_encoded.columns, errors='ignore')
X = pd.concat([X_num, cat_encoded], axis=1)
X.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,Male,U
0,15.5,7.760467,17.343389,4.181353,1.0,0.0
1,3.08,6.20859,16.230984,4.150313,1.0,0.0
2,7.5,7.113956,18.047227,4.205057,0.0,0.0
3,6.7,5.783825,11.73711,4.055333,1.0,0.0
4,8.785714,6.64379,12.494862,4.088969,0.0,0.0


## B. Train-test split

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# Remark: the split is best done before transforming and preprocessing the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

`test_size` is the fraction of the data frame's rows assigned to the test set: here `0.3` means 30% of the rows go to the test set, and the remaining 70% form the training set.

In [11]:
# Fit a linear regression on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

## C. Predict on test data

##### How much money will they give?

In [13]:
# Predict the target for the held-out rows and check how many predictions came back.
predictions = model.predict(X_test)
np.shape(predictions)

(1401,)

In [14]:
# Display the predicted values for the test rows.
predictions

array([16.52541726, 12.9363339 , 17.14130604, ..., 17.1243607 ,
       13.02646931, 12.67364727])

## D. Evaluate the model

![pandas](mse.jpeg)

In [15]:
# (R², RMSE, MSE) on the held-out test set.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`, which is
# deprecated since scikit-learn 1.4 and removed in 1.6.
r2_score(y_test, predictions), np.sqrt(mean_squared_error(y_test, predictions)), mean_squared_error(y_test, predictions)

(0.27059449578723616, 11.41472672814834, 130.2959862783041)

#### The same metrics, in a more readable form

In [16]:
# The same three test-set metrics, stored under names and printed one per line.
r2 = r2_score(y_test, predictions)
MSE = mean_squared_error(y_test, predictions)
# RMSE is the square root of the MSE. Computing it this way avoids the
# `squared=False` argument, which is deprecated since scikit-learn 1.4 and
# removed in 1.6.
RMSE = np.sqrt(MSE)
print("r2 = ", r2)
print("RMSE = ", RMSE)
print("MSE = ", MSE)

r2 =  0.27059449578723616
RMSE =  11.41472672814834
MSE =  130.2959862783041


###### IMPORTANT: to make predictions on the new data, we have to process the data (X features) in the same way.

#### Remember how a linear model works under the hood

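Each row of `X` is a vector of feature values, e.g. the first row (rounded; the last two entries are the encoded `Male` and `U` columns):

[[15.5, 7.8, 17.3, 4.2, 1, 0],
 [...],
]

The model multiplies every feature by its learned coefficient and adds the intercept (the numbers below are only illustrative):

y = (0.7*x1) + (1.3*x2) + ...  + 8.2 = prediction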