Estudo dirigido 5 - Regressão linear

Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd
from os import path, getcwd, sep, listdir
from copy import copy
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error

Carregando os dados dos candidatos

In [2]:
cur_dir  = getcwd()
data_dir = path.join(cur_dir, 'data')
csv_file = path.join(data_dir, 'candidatos.csv')
df       = pd.read_csv(csv_file)

In [3]:
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000
8,one,6.0,6,40000


Transformando literais da coluna `experience` em `int`

In [4]:
def number_as_literal_to_int(literal):

    if type(literal) == float: # nan
        return literal
    
    conversion_dict = dict()
    conversion_dict['one']    = 1
    conversion_dict['two']    = 2
    conversion_dict['three']  = 3
    conversion_dict['five']   = 5
    conversion_dict['seven']  = 7
    conversion_dict['ten']    = 10
    conversion_dict['eleven'] = 11

    return conversion_dict[literal]

df2 = copy(df)

for i in range(df.index.start, df.index.stop):
    
    row = df.iloc[i]
    exp = row[0]
    
    df2.at[i, 'experience'] = number_as_literal_to_int(exp)

In [5]:
df2

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,,7,72000
7,11.0,7.0,8,80000
8,1.0,6.0,6,40000


Dividindo o conjunto de dados em teste e treino

As linhas que possuem `NaN` não farão parte de nenhum dos conjuntos

In [6]:
df3 = df2.dropna()
df3

Unnamed: 0,experience,test_score,interview_score,salary
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
7,11,7.0,8,80000
8,1,6.0,6,40000


Criando um dataframe com as fileiras que possuem `NaN` para testar o regressor posteriormente

In [48]:
df_na = pd.DataFrame(index=[0,1,6], columns=df.columns)

for index in df_na.index:
    for column in df_na.columns:

        df_na.at[index, column] = df2.at[index, column]

df_na

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
6,10.0,,7,72000


Os labels foram definidos com base nas colunas que possuem `NaN`

#### Regressor para `experience`

In [26]:
X_experience = df3.iloc[:, 1:]
X_experience

Unnamed: 0,test_score,interview_score,salary
2,6.0,7,60000
3,10.0,10,65000
4,9.0,6,70000
5,7.0,10,62000
7,7.0,8,80000
8,6.0,6,40000


In [27]:
Y_experience = df3.iloc[:, 0]
Y_experience

2     5
3     2
4     7
5     3
7    11
8     1
Name: experience, dtype: object

In [28]:
X_experience_train, X_experience_test, Y_experience_train, Y_experience_test = train_test_split(X_experience, Y_experience)

In [37]:
experience_model = linear_model.LinearRegression()
experience_model = experience_model.fit(X_experience_train, Y_experience_train)

In [38]:
Y_experience_pred = experience_model.predict(X_experience_test)

In [41]:
MAE  = mean_absolute_error(Y_experience_test, Y_experience_pred)
print(f"Mean absolute error: {MAE}")

Mean absolute error: 1.722222222224128


In [50]:
X_experience_na = df_na.iloc[:2, 1:]
X_experience_na

Unnamed: 0,test_score,interview_score,salary
0,8.0,9,50000
1,8.0,6,45000


In [53]:
predictions = experience_model.predict(X_experience_na)

i = 0
for p in predictions:
    print(f"{i}: {p}")
    i += 1

0: -1.6444444444450692
1: -1.7333333333347145


#### Regressor para `test_score`

In [55]:
X_score = df3.iloc[:, [0,2,3]]
X_score

Unnamed: 0,experience,interview_score,salary
2,5,7,60000
3,2,10,65000
4,7,6,70000
5,3,10,62000
7,11,8,80000
8,1,6,40000


In [56]:
Y_score = df3.iloc[:, 1]
Y_score

2     6.0
3    10.0
4     9.0
5     7.0
7     7.0
8     6.0
Name: test_score, dtype: float64

In [57]:
X_score_train, X_score_test, Y_score_train, Y_score_test = train_test_split(X_score, Y_score)

In [58]:
test_score_model = linear_model.LinearRegression()
test_score_model = test_score_model.fit(X_score_train, Y_score_train)

In [61]:
Y_score_pred = test_score_model.predict(X_score_test)

print(f"Predicoes do treino")
i = 0
for p in Y_score_pred:
    print(f"{i}: {p}")
    i += 1

Predicoes do treino
0: 1.2500000000114753
1: 5.89999999999781


In [62]:
MAE  = mean_absolute_error(Y_score_test, Y_score_pred)
print(f"Mean absolute error: {MAE}")

Mean absolute error: 3.4249999999953573


In [68]:
X_score_na = df_na.iloc[:, [0,2,3]]
X_score_na = X_score_na.dropna()
X_score_na

Unnamed: 0,experience,interview_score,salary
6,10,7,72000


In [69]:
predictions = test_score_model.predict(X_score_na)

for p in predictions:
    print(f"{6}: {p}")

6: 1.0250000000109836
