# Primer Modelo (Hola Mundo)

## 1) Cargar el dataset.

In [2]:
import pandas as pd

# Load the CSV from the relative data/ folder and preview its first three rows.
census_path = 'data/census_test.csv'
census_df = pd.read_csv(census_path)

census_df.head(n=3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.


## 2) Exploración y análisis (básico)

In [4]:
# Show the (rows, columns) tuple followed by the dtype of every column.
print(census_df.shape, census_df.dtypes, sep='\n')

(16281, 15)
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


## 3) Preprocesamiento (básico).

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'income']

In [22]:
from sklearn.preprocessing import LabelEncoder

def get_cols_by_type(df, type_name):
    """Return the labels of the columns of ``df`` whose dtype equals ``type_name``.

    Args:
        df: DataFrame whose ``dtypes`` are inspected.
        type_name: Value compared with ``==`` against each column dtype
            (for example ``'object'`` or ``'int64'``).

    Returns:
        list: Matching column labels, in the frame's column order. Empty
        when no column matches.
    """
    return [column for column, dtype in df.dtypes.items() if dtype == type_name]

def encode_category(df, category):
    """Label-encode a single column of ``df``.

    A new ``LabelEncoder`` is fitted on the column at every call, so the
    value-to-integer mapping is specific to the frame that is passed in.

    Args:
        df: DataFrame containing the column.
        category: Label of the column to encode.

    Returns:
        The codes produced by ``LabelEncoder.fit_transform`` for
        ``df[category]``. ``df`` itself is not modified here.
    """
    column_values = df[category]
    return LabelEncoder().fit_transform(column_values)

def encode_categories(df, categories):
    """Replace each listed column of ``df`` with its label-encoded values.

    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame to update.
        categories: Iterable of column labels to encode, processed in order.
    """
    for column_name in categories:
        codes = encode_category(df, column_name)
        df[column_name] = codes
        
# Label-encode every object-dtype column of census_df in place, then preview the result.
object_columns = get_cols_by_type(census_df, 'object')
encode_categories(census_df, object_columns)

census_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,38,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,38,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,38,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,38,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,38,0


## 4) Dividir el dataset para entrenamiento y pruebas

In [26]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
features = census_df.drop(columns=['income'])
label = census_df[['income']]  # double brackets keep a one-column DataFrame

display(features.head(2), label.head(2))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.20,
    random_state=300,
)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,38
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,38


Unnamed: 0,income
0,0
1,0


## 5) Crear modelo

In [33]:
from sklearn.linear_model import LogisticRegression
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any the solver may raise while fitting. It is kept because later
# cells run under it, but a narrower, category-specific filter is preferable.
warnings.filterwarnings('ignore')

clf = LogisticRegression()
# fit() expects a 1-D target; y_train is a single-column DataFrame, so flatten it
# instead of relying on scikit-learn's implicit (and warned-about) conversion.
clf = clf.fit(X_train, y_train.values.ravel())

# Two hand-written samples, one value per training feature, in column order.
sample_rows = [
    [24, 3, 20000, 2, 6,  5,  6, 2, 0, 0, 0, 0, 39, 38],
    [27, 3, 30000, 6, 11, 5, 10, 1, 3, 1, 1, 1, 41, 38],
]
# Reuse the training column labels so the model receives named features,
# matching how it was fitted, rather than anonymous lists.
samples = pd.DataFrame(sample_rows, columns=X_train.columns)

prediction = clf.predict(samples)
print(prediction)

[0 0]


## 6) Evaluar modelo (Exactitud)

In [34]:
from sklearn.metrics import accuracy_score

# Score the fitted classifier on the held-out rows.
test_predictions = clf.predict(X_test)
results = accuracy_score(y_test, test_predictions)

print(results)

0.7970525023027326


## 7) Entrenar y evaluar un modelo con el dataset completo

In [43]:
# Repeat the whole pipeline on the second CSV: load, encode, split, fit, score.
census_full_df = pd.read_csv('data/census.csv')
print(census_full_df.shape)

# encode_category fits a fresh encoder per call, so the integer codes produced
# here are derived from this frame alone and are not tied to those of census_df.
encode_categories(census_full_df, get_cols_by_type(census_full_df, 'object'))
features_full = census_full_df.drop(['income'], axis = 1)
label_full = census_full_df[['income']]

display(census_full_df.head(3))

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(features_full, label_full, test_size=0.20, random_state=300)
clf_full = LogisticRegression()
# fit() expects a 1-D target; y_train_full is a single-column DataFrame, so
# flatten it instead of relying on scikit-learn's implicit conversion.
clf_full = clf_full.fit(X_train_full, y_train_full.values.ravel())
results_full = accuracy_score(y_test_full, clf_full.predict(X_test_full))

print(results_full)


(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0


0.7795178873023184
