In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from metric import classificationSummary
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

In [2]:
# Read the whole CSV into memory, then print a structural summary
# (row range, column count, dtypes, memory usage) as a quick sanity check.
new_data = pd.read_csv(filepath_or_buffer='salidas.csv')
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2488532 entries, 0 to 2488531
Columns: 174 entries, tip_naci to ZIMBABWE
dtypes: int64(174)
memory usage: 3.2 GB


In [3]:
# Input features: every column except the target.
X = new_data.drop('mot_viam', axis=1)
# Target variable: the column the classifier is trained to predict.
y = new_data['mot_viam']

In [4]:
# Display (n_rows, n_columns) of the feature matrix as the cell output.
X.shape

(2488532, 173)

In [5]:
# Hold out 40% of the rows for validation; the fixed random_state makes
# the split reproducible across runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [6]:
# L2-penalised logistic regression. C is the inverse of the regularisation
# strength, so C=1e42 makes the penalty negligible in practice.
# NOTE(review): the target has 8 classes (0-7); recent scikit-learn releases
# deprecate solver='liblinear' for multiclass targets -- confirm against the
# installed version before upgrading.
logit_reg = LogisticRegression(
    solver='liblinear',
    C=1e42,
    penalty="l2",
)
# Kept as the last expression so the notebook still displays the fitted estimator.
logit_reg.fit(train_X, train_y)

In [7]:
# Evaluate on the hold-out split: predict first, then print the
# confusion matrix and accuracy for those predictions.
valid_pred = logit_reg.predict(valid_X)
classificationSummary(valid_y, valid_pred)

Confusion Matrix (Accuracy 0.7908)

       Prediction
Actual      0      1      2      3      4      5      6      7
     0 381086      1      9     56  44852      0     13     14
     1   5698      0      2      3   1623      0      1      7
     2  19352      1      0     13   9359      0      1     12
     3   7768      0      4    332    770      0      0      0
     4  65846      1     16     49 405532      0     11     23
     5      0      0      0      0    151      0      0      0
     6   5484      0      0      0   3361      0     69      8
     7  25547      1      7     13  18213      0      0    104


In [None]:
import pickle

# Destination file for the serialized, fitted classifier.
output_model_file = 'model_logit_reg.pkl'

# Binary write mode is required by pickle; the context manager closes the
# file even if serialization fails.
# Note: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
with open(output_model_file, mode='wb') as model_fh:
    pickle.dump(logit_reg, model_fh)

In [None]:
# Column schema taken from the loaded data instead of a hand-maintained
# 174-name list: a hard-coded copy can drift from the CSV (renamed, added or
# reordered columns) and would then place values in the wrong feature
# positions without raising any error.
# `columns` (target included) stays defined because later cells iterate over it.
columns = list(new_data.columns)

# Known values for the single observation to score; every feature that is
# not listed here defaults to 0.
info = {
    "tip_naci": 1,
    "mes_movi": 7,
    "dia_movi": 20,
    "via_tran": 1,
    "USA": 1
}

# Fail loudly on keys that are not model features. Previously a misspelled
# key (e.g. a wrong country name) was dropped silently and the model scored
# an all-zero value for it.
unknown = sorted(set(info) - set(X.columns))
if unknown:
    raise KeyError(f"info has keys that are not model features: {unknown}")

# One value per feature, in exactly the order the model was trained on.
mi_diccionario = {columna: info.get(columna, 0) for columna in X.columns}

df = pd.DataFrame([mi_diccionario], columns=X.columns)

# Predict on the DataFrame rather than df.to_numpy(): the model was fitted on
# a DataFrame, so passing a bare ndarray makes scikit-learn (1.0+) warn about
# missing feature names and skips its feature-name/order consistency check.
input_data = df

logit_reg.predict(input_data)

In [None]:
# Known values for the single observation to score; every feature that is
# not listed here defaults to 0.
info = {
    "tip_naci": 1,
    "mes_movi": 2,
    "dia_movi": 3,
    "via_tran": 1,
    "USA": 1
}

# Fail loudly on keys that are not model features instead of silently
# ignoring them (a typo would otherwise score an all-zero value).
unknown = sorted(set(info) - set(X.columns))
if unknown:
    raise KeyError(f"info has keys that are not model features: {unknown}")

# One value per feature, in the training column order (X.columns), so the
# row cannot get out of step with the fitted model.
mi_diccionario = {columna: info.get(columna, 0) for columna in X.columns}

df = pd.DataFrame([mi_diccionario], columns=X.columns)

# Predict on the DataFrame rather than df.to_numpy(): the model was fitted on
# a DataFrame, so a bare ndarray triggers scikit-learn's (1.0+) feature-name
# warning and skips its feature-name/order consistency check.
input_data = df

logit_reg.predict(input_data)

In [None]:
# Known values for the single observation to score; every feature that is
# not listed here defaults to 0.
info = {
    "tip_naci": 1,
    "mes_movi": 12,
    "dia_movi": 3,
    "via_tran": 3,
    "PERU": 1
}

# Fail loudly on keys that are not model features instead of silently
# ignoring them (a typo would otherwise score an all-zero value).
unknown = sorted(set(info) - set(X.columns))
if unknown:
    raise KeyError(f"info has keys that are not model features: {unknown}")

# One value per feature, in the training column order (X.columns), so the
# row cannot get out of step with the fitted model.
mi_diccionario = {columna: info.get(columna, 0) for columna in X.columns}

df = pd.DataFrame([mi_diccionario], columns=X.columns)

# Predict on the DataFrame rather than df.to_numpy(): the model was fitted on
# a DataFrame, so a bare ndarray triggers scikit-learn's (1.0+) feature-name
# warning and skips its feature-name/order consistency check.
input_data = df

logit_reg.predict(input_data)