# Pandas

In [1]:
import pandas as pd

In [52]:
base = pd.read_csv('credit-data.csv')

In [53]:
base.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [54]:
base.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


# Tratamento de valores inconsistentes

In [55]:
base.loc[base['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


### Método 1 - Apagando valores negativos na variável AGE

In [18]:
# apagar a coluna inteira
base.drop('age', 1, inplace=True)

In [20]:
# apagar somente os registros com problemas
base.drop(base[base.age < 0].index, inplace=True)

In [21]:
base.loc[base['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


### Método 2 (RECOMANDADO) - Substituindo os valores pela média

In [56]:
base['age'].mean()

40.80755937840458

In [57]:
base['age'][base.age > 0].mean()

40.92770044906149

In [58]:
# Substituindo os valores negativos pela média dos valores acima de 0
base.loc[base.age < 0, 'age'] = 40.92

In [59]:
base.loc[base['age'] < 0]

Unnamed: 0,clientid,income,age,loan,default


In [72]:
base.head(30)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1
5,6,24904.06414,57.471607,15.498598,0
6,7,48430.359613,26.809132,5722.581981,0
7,8,24500.141984,32.897548,2971.00331,1
8,9,40654.892537,55.496853,4755.82528,0
9,10,25075.872771,39.776378,1409.230371,0


# Tratamento de valores faltantes

In [61]:
base.loc[pd.isnull(base['age'])]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


# Divisão da base - PREVISORES e CLASSE

In [62]:
previsores = base.iloc[:, 1:4].values

In [63]:
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [64]:
classe = base.iloc[:, 4].values

In [65]:
classe

array([0, 0, 0, ..., 1, 0, 0])

In [66]:
from sklearn.preprocessing import Imputer

In [67]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(previsores[:, 0:3])



In [68]:
previsores[:,0:3] = imputer.transform(previsores[:,0:3])

In [74]:
previsores[28]

array([5.94178054e+04, 4.09276889e+01, 2.08262594e+03])

# Escalonamento de atributos

In [75]:
from sklearn.preprocessing import StandardScaler

In [76]:
scaler = StandardScaler()

In [77]:
previsores = scaler.fit_transform(previsores)

In [79]:
previsores

array([[ 1.45393393,  1.36538093,  1.20281942],
       [-0.76217555,  0.5426602 ,  0.69642695],
       [ 0.83682073,  1.67417189,  1.17471147],
       ...,
       [-0.07122592, -0.97448519,  0.35420081],
       [-0.11000289,  1.73936739, -0.92675625],
       [ 1.682986  ,  1.14917639,  0.96381038]])