<a href="https://colab.research.google.com/github/virf96/Basico/blob/main/Estabilidad_y_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Estabilidad y performance

## Preparación de ambiente

### Carga de módulos

In [3]:
pip install pygal

Collecting pygal
[?25l  Downloading https://files.pythonhosted.org/packages/5f/b7/201c9254ac0d2b8ffa3bb2d528d23a4130876d9ba90bc28e99633f323f17/pygal-2.4.0-py2.py3-none-any.whl (127kB)
[K     |██▋                             | 10kB 11.1MB/s eta 0:00:01[K     |█████▏                          | 20kB 12.2MB/s eta 0:00:01[K     |███████▊                        | 30kB 8.5MB/s eta 0:00:01[K     |██████████▎                     | 40kB 7.3MB/s eta 0:00:01[K     |████████████▉                   | 51kB 5.5MB/s eta 0:00:01[K     |███████████████▍                | 61kB 6.1MB/s eta 0:00:01[K     |██████████████████              | 71kB 6.1MB/s eta 0:00:01[K     |████████████████████▌           | 81kB 6.7MB/s eta 0:00:01[K     |███████████████████████▏        | 92kB 6.1MB/s eta 0:00:01[K     |█████████████████████████▊      | 102kB 6.4MB/s eta 0:00:01[K     |████████████████████████████▎   | 112kB 6.4MB/s eta 0:00:01[K     |██████████████████████████████▉ | 122kB 6.4MB/s eta 0

In [4]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, r2_score
from datetime import datetime
import pandas as pd
import numpy as np
import pygal

### Funciones relevantes

In [5]:
# Función que regresa el IV de la variable provista
def IV(df, var, tgt):
    aux = df[[var, tgt]].groupby(var).agg(["count", "sum"])
    aux["evento"] = aux[tgt, "sum"]
    aux["no_evento"] = aux[tgt, "count"] - aux[tgt, "sum"]
    aux["%evento"] = aux["evento"] / aux["evento"].sum()
    aux["%no_evento"] = aux["no_evento"] / aux["no_evento"].sum()
    aux["WOE"] = np.log(aux["%no_evento"] / aux["%evento"])
    aux["IV"] = (aux["%no_evento"] - aux["%evento"])*aux["WOE"]
    return aux["IV"].sum()

### Data Wrangling

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv("/content/drive/MyDrive/datasets/OnlineNewsPopularity.csv").sample(frac = 0.1).reset_index(drop = True)

df.columns = [x.strip() for x in df.columns] # Remoción de espacio de las columas (' hola ' -> 'hola')

df["success"] = (df["shares"] > df["shares"].quantile(.5))*1 # Generación de variable objetivo discreta

ls_cont = ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words', 
           'n_non_stop_unique_tokens', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 
           'average_token_length', 'num_keywords', 'kw_min_min', 'kw_max_min', 'kw_avg_min', 
           'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 
           'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 
           'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 
           'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words',
           'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
           'max_positive_polarity', 'avg_negative_polarity', 'min_negative_polarity', 
           'max_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity', 
           'abs_title_subjectivity', 'abs_title_sentiment_polarity']
target = "shares"
target_disc = "success"

In [8]:
df

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,success
0,http://mashable.com/2013/04/10/nhl-team-trolls...,638.0,8.0,288.0,0.663121,1.0,0.771739,10.0,5.0,1.0,0.0,4.673611,4.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,1100.0,423.000000,9400.0,617900.0,184200.000000,2170.397306,3575.000000,3006.136878,2000.0,2000.0,2000.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.050003,0.052294,0.050149,0.797554,0.050000,0.429661,0.115565,0.048611,0.010417,0.823529,0.176471,0.280535,0.033333,0.5,-0.230556,-0.400000,-0.125000,0.000000,0.000000,0.500000,0.000000,1400,0
1,http://mashable.com/2013/03/05/getgoing/,674.0,13.0,595.0,0.494098,1.0,0.706231,5.0,2.0,1.0,0.0,4.578151,8.0,0.0,0.0,1.0,0.0,0.0,0.0,217.0,643.0,410.250000,3200.0,617900.0,123887.500000,1440.307692,7675.076923,3123.999249,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.787695,0.136638,0.025033,0.025067,0.025567,0.434557,0.250529,0.042017,0.003361,0.925926,0.074074,0.420762,0.050000,0.8,-0.400000,-0.600000,-0.200000,0.100000,0.000000,0.400000,0.000000,2000,1
2,http://mashable.com/2013/09/17/jumpstagram-pho...,478.0,12.0,189.0,0.735450,1.0,0.856000,4.0,3.0,0.0,1.0,4.862434,4.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,176.0,84.000000,11600.0,843300.0,278675.000000,1500.701493,3276.256926,2441.977896,1300.0,11600.0,5433.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050003,0.799198,0.050138,0.050661,0.050000,0.513333,0.001667,0.026455,0.021164,0.555556,0.444444,0.677273,0.136364,1.0,-0.583333,-1.000000,-0.166667,1.000000,-1.000000,0.500000,1.000000,914,0
3,http://mashable.com/2014/05/06/aol-buys-conver...,247.0,9.0,618.0,0.575862,1.0,0.709677,14.0,3.0,0.0,11.0,4.645631,10.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,803.0,250.686667,0.0,843300.0,443570.000000,0.000000,85755.500000,12495.068718,1900.0,5800.0,3233.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420582,0.020216,0.020046,0.519155,0.020000,0.566894,0.178399,0.045307,0.011327,0.800000,0.200000,0.393220,0.050000,1.0,-0.340476,-0.666667,-0.050000,1.000000,-1.000000,0.500000,1.000000,1200,0
4,http://mashable.com/2013/02/17/tech-inspired-r...,690.0,10.0,1003.0,0.472505,1.0,0.632588,36.0,2.0,0.0,13.0,4.725823,6.0,0.0,0.0,0.0,0.0,0.0,0.0,217.0,397.0,333.333333,37400.0,69100.0,62816.666667,2662.860184,4038.441441,3396.760912,1400.0,4200.0,2800.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.034540,0.437341,0.033410,0.460561,0.034149,0.551878,0.248830,0.065803,0.015952,0.804878,0.195122,0.479343,0.100000,1.0,-0.348301,-1.000000,-0.050000,0.000000,0.000000,0.500000,0.000000,6400,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3959,http://mashable.com/2013/08/15/facebook-paypal/,511.0,12.0,375.0,0.524064,1.0,0.675439,6.0,4.0,1.0,0.0,5.088000,10.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,602.0,153.700000,7300.0,843300.0,270520.000000,1607.350000,4349.762646,2929.302946,1400.0,28500.0,14950.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.351851,0.020002,0.020081,0.020000,0.588065,0.406524,0.004905,0.018667,0.018667,0.500000,0.500000,0.259184,0.100000,0.5,-0.213095,-0.300000,-0.050000,0.454545,0.136364,0.045455,0.136364,2500,1
3960,http://mashable.com/2013/01/09/google-flu/,729.0,6.0,389.0,0.555556,1.0,0.725581,9.0,5.0,1.0,1.0,4.316195,7.0,0.0,0.0,0.0,0.0,0.0,1.0,217.0,495.0,402.333333,0.0,17100.0,2914.285714,0.000000,1953.144444,542.041043,1300.0,1300.0,1300.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.028725,0.028578,0.497388,0.028574,0.416735,0.614921,0.138499,0.056555,0.010283,0.846154,0.153846,0.352698,0.100000,1.0,-0.322222,-0.500000,-0.155556,0.000000,0.000000,0.500000,0.000000,5600,1
3961,http://mashable.com/2013/10/30/roomba-light-show/,435.0,11.0,1285.0,0.376959,1.0,0.576758,8.0,7.0,1.0,1.0,4.473152,10.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,562.0,160.333333,0.0,843300.0,157700.000000,0.000000,3436.073639,2322.588726,549.0,4800.0,1726.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.020088,0.020003,0.020003,0.020023,0.919882,0.516600,0.168892,0.046693,0.009339,0.833333,0.166667,0.367949,0.050000,1.0,-0.266667,-0.800000,-0.100000,0.000000,0.000000,0.500000,0.000000,6500,1
3962,http://mashable.com/2013/01/22/kate-upton-merc...,716.0,10.0,179.0,0.636364,1.0,0.739130,4.0,3.0,0.0,1.0,4.374302,7.0,0.0,0.0,0.0,0.0,0.0,0.0,217.0,577.0,385.285714,5000.0,51900.0,23471.428571,1532.954545,2985.708861,2328.353760,881.0,881.0,881.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224379,0.028842,0.028640,0.689564,0.028575,0.529221,0.162338,0.061453,0.016760,0.785714,0.214286,0.319008,0.033333,0.7,-0.416667,-0.800000,-0.150000,0.666667,0.333333,0.166667,0.333333,1000,0


In [10]:
df["date"] = pd.to_datetime(df["url"].str.split("/").str[3:6].str.join("/")) # Obtención de fecha

In [11]:
df["date"].dt.strftime("%Y-%m-01")

0       2013-04-01
1       2013-03-01
2       2013-09-01
3       2014-05-01
4       2013-02-01
           ...    
3959    2013-08-01
3960    2013-01-01
3961    2013-10-01
3962    2013-01-01
3963    2013-08-01
Name: date, Length: 3964, dtype: object

In [12]:
df["month"] = df["date"].dt.strftime("%Y-%m-01") # Generación de mes

In [13]:
sorted(df["month"].unique())

['2013-01-01',
 '2013-02-01',
 '2013-03-01',
 '2013-04-01',
 '2013-05-01',
 '2013-06-01',
 '2013-07-01',
 '2013-08-01',
 '2013-09-01',
 '2013-10-01',
 '2013-11-01',
 '2013-12-01',
 '2014-01-01',
 '2014-02-01',
 '2014-03-01',
 '2014-04-01',
 '2014-05-01',
 '2014-06-01',
 '2014-07-01',
 '2014-08-01',
 '2014-09-01',
 '2014-10-01',
 '2014-11-01',
 '2014-12-01']

In [14]:
validate = df[df["date"] >= datetime(2014, 6, 1)].reset_index(drop = True).copy() # Generación de validate por fecha
train = df.drop(index = validate.index).reset_index(drop = True).copy()

In [15]:
train = train[train[target] <= 32000].reset_index(drop = True) # Remoción de outlier en target

In [16]:
# Separación de datos
Xt = train[ls_cont]
yr = train[target]
yc = train[target_disc]

## Modelado 

#### Clasificador

In [17]:
logreg = LogisticRegression()

In [18]:
logreg.fit(Xt, yc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Cross validation
ls_score = cross_val_score(estimator=logreg, X = Xt, y=yc, n_jobs=-1, scoring="roc_auc", cv = 4)

In [20]:
np.mean(ls_score), np.std(ls_score)

(0.6117698492432982, 0.01696194108397957)

In [22]:
validate["yc_hat"] = logreg.predict(validate[ls_cont]) # Predicción de si el artículo será exitoso o no

In [27]:
validate["yc_hat"]

0       1
1       0
2       1
3       0
4       0
       ..
1400    0
1401    0
1402    0
1403    0
1404    1
Name: yc_hat, Length: 1405, dtype: int64

### Regresión

In [28]:
yr

0       2300
1        574
2       2700
3        638
4       4400
        ... 
2526    2500
2527    5600
2528    6500
2529    1000
2530    1800
Name: shares, Length: 2531, dtype: int64

In [29]:
linreg = Ridge()

In [30]:
linreg.fit(Xt, yr)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [31]:
ls_score = cross_val_score(estimator=linreg, X = Xt, y=yr, n_jobs=-1, scoring="r2", cv = 4)

In [32]:
np.mean(ls_score), np.std(ls_score)

(0.047197233683346634, 0.017711455551925784)

In [33]:
validate["yr_hat"] = linreg.predict(validate[ls_cont]) # Generación de predicción

In [34]:
validate["yr_hat"]

0       4362.550413
1       1307.610513
2       2307.089708
3       1979.035009
4       2379.767211
           ...     
1400    1518.544691
1401    1371.283761
1402    1530.856755
1403    1980.853447
1404    2591.261522
Name: yr_hat, Length: 1405, dtype: float64

In [35]:
validate

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,success,date,month,yc_hat,yr_hat
0,http://mashable.com/2014/10/17/turbo-wheelchar/,80.0,14.0,404.0,0.551020,1.0,0.618321,13.0,4.0,6.0,0.0,4.784653,7.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,765.0,232.857143,41600.0,843300.0,397642.857143,3447.585836,7992.437500,5382.803192,1900.0,9300.0,5600.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028698,0.232516,0.028572,0.680812,0.029402,0.430635,-0.081587,0.032178,0.027228,0.541667,0.458333,0.276923,0.100000,0.8,-0.573232,-0.900000,-0.125000,0.588889,0.166667,0.088889,0.166667,1500,1,2014-10-17,2014-10-01,1,4362.550413
1,http://mashable.com/2014/11/24/jurassic-world-...,44.0,13.0,301.0,0.531690,1.0,0.629630,4.0,3.0,1.0,1.0,4.491694,5.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,517.0,113.000000,12500.0,843300.0,234120.000000,2094.176471,3407.184361,2656.353374,820.0,891.0,855.500000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040000,0.839894,0.040000,0.040105,0.040000,0.389031,0.102381,0.043189,0.026578,0.619048,0.380952,0.382051,0.250000,1.0,-0.283333,-0.400000,-0.166667,0.507071,0.023232,0.007071,0.023232,379,0,2014-11-24,2014-11-01,0,1307.610513
2,http://mashable.com/2014/06/23/american-idol-j...,199.0,8.0,738.0,0.508916,1.0,0.708716,28.0,9.0,4.0,2.0,4.710027,10.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,309.0,126.300000,26900.0,843300.0,321950.000000,2438.409420,4370.356505,3152.541032,1500.0,2800.0,1940.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020089,0.020012,0.020009,0.020018,0.919871,0.475180,0.134991,0.043360,0.016260,0.727273,0.272727,0.415294,0.033333,0.8,-0.310417,-0.500000,-0.125000,1.000000,-0.500000,0.500000,0.500000,3700,1,2014-06-23,2014-06-01,1,2307.089708
3,http://mashable.com/2014/09/26/isis-influence-...,103.0,12.0,1364.0,0.379258,1.0,0.561008,11.0,11.0,10.0,1.0,4.142962,8.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,687.0,234.375000,3900.0,843300.0,285837.500000,1217.025641,3475.248622,2588.589059,827.0,2500.0,1603.857143,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.025000,0.025051,0.025000,0.025000,0.899948,0.494337,0.137017,0.057185,0.024927,0.696429,0.303571,0.385282,0.050000,1.0,-0.298150,-0.800000,-0.050000,0.540404,-0.065657,0.040404,0.065657,2200,1,2014-09-26,2014-09-01,0,1979.035009
4,http://mashable.com/2014/07/28/goat-stuck-in-a...,164.0,10.0,666.0,0.451807,1.0,0.616402,5.0,3.0,2.0,0.0,4.936937,5.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,748.0,232.400000,4500.0,843300.0,345520.000000,1730.846154,5413.666667,3466.415837,1900.0,1900.0,1900.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040001,0.040001,0.040065,0.040001,0.839933,0.465843,0.118293,0.043544,0.030030,0.591837,0.408163,0.472419,0.100000,0.9,-0.261111,-0.600000,-0.100000,0.100000,0.000000,0.400000,0.000000,5800,1,2014-07-28,2014-07-01,0,2379.767211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,http://mashable.com/2014/12/12/united-airlines...,23.0,13.0,509.0,0.513919,1.0,0.603125,26.0,2.0,1.0,0.0,5.039293,7.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,902.0,179.600000,0.0,843300.0,317343.142857,0.000000,3400.887791,1556.008263,716.0,716.0,716.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028572,0.028572,0.885711,0.028572,0.028573,0.559286,0.007619,0.021611,0.047151,0.314286,0.685714,0.370130,0.200000,0.5,-0.156944,-0.500000,-0.100000,0.433333,-0.144444,0.066667,0.144444,1100,0,2014-12-12,2014-12-01,0,1518.544691
1401,http://mashable.com/2014/11/17/supernatural-mi...,50.0,12.0,556.0,0.498092,1.0,0.647590,26.0,1.0,1.0,0.0,4.694245,3.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-1.000000,141400.0,843300.0,534200.000000,2171.456721,3416.457322,2691.958962,26600.0,26600.0,26600.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.733325,0.066667,0.066674,0.407662,0.141596,0.032374,0.021583,0.600000,0.400000,0.399285,0.100000,1.0,-0.237500,-0.500000,-0.050000,0.000000,0.000000,0.500000,0.000000,779,0,2014-11-17,2014-11-01,0,1371.283761
1402,http://mashable.com/2014/11/14/kenya-mydressmy...,52.0,10.0,199.0,0.637755,1.0,0.857143,2.0,2.0,1.0,0.0,4.874372,5.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,459.0,134.200000,6800.0,843300.0,320420.000000,1336.888889,3415.489909,2430.530305,12700.0,12700.0,12700.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.640292,0.040001,0.040001,0.040000,0.239706,0.528074,0.311494,0.040201,0.005025,0.888889,0.111111,0.398295,0.136364,0.8,-0.071429,-0.071429,-0.071429,0.000000,0.000000,0.500000,0.000000,1600,1,2014-11-14,2014-11-01,0,1530.856755
1403,http://mashable.com/2014/09/24/cash-smartwatch...,106.0,12.0,1097.0,0.420370,1.0,0.575486,29.0,15.0,1.0,2.0,4.969918,8.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,980.0,269.125000,2600.0,843300.0,235437.500000,1790.000000,4095.516245,2532.535305,873.0,2800.0,1834.600000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.025002,0.025002,0.769834,0.025001,0.155161,0.394420,0.233609,0.083865,0.008204,0.910891,0.089109,0.361612,0.033333,1.0,-0.291667,-0.500000,-0.100000,0.333333,0.366667,0.166667,0.366667,1900,1,2014-09-24,2014-09-01,0,1980.853447


## Estabilidad y performance

### PSI

In [37]:
dc_bins = {}
for feat in ls_cont:
    dc_bins.update({feat: pd.cut(Xt[feat], bins=3, retbins=True)[1]})

In [38]:
dc_bins

{'LDA_00': array([0.0172815 , 0.31878804, 0.61939276, 0.91999749]),
 'LDA_01': array([0.01728182, 0.31869151, 0.61919968, 0.91970785]),
 'LDA_02': array([0.0172801 , 0.31878723, 0.61939254, 0.91999785]),
 'LDA_03': array([0.01728021, 0.31877841, 0.61937482, 0.91997124]),
 'LDA_04': array([0.01728317, 0.32119172, 0.62419126, 0.92719081]),
 'abs_title_sentiment_polarity': array([-0.001     ,  0.33333333,  0.66666667,  1.        ]),
 'abs_title_subjectivity': array([-0.0005    ,  0.16666667,  0.33333333,  0.5       ]),
 'average_token_length': array([-7.97468354e-03,  2.65822785e+00,  5.31645570e+00,  7.97468354e+00]),
 'avg_negative_polarity': array([-1.001     , -0.66666667, -0.33333333,  0.        ]),
 'avg_positive_polarity': array([-0.0008    ,  0.26666667,  0.53333333,  0.8       ]),
 'global_rate_negative_words': array([-0.00013693,  0.04564315,  0.09128631,  0.13692946]),
 'global_rate_positive_words': array([-0.00013223,  0.04407713,  0.08815427,  0.1322314 ]),
 'global_sentiment

In [39]:
len(ls_cont)

44

In [43]:
pd.cut(Xt[feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_train"}) # Segmentación de variable continua

Unnamed: 0,index,n_tokens_title_train
0,"(7.0, 12.0]",0.766495
1,"(12.0, 17.0]",0.150533
2,"(1.985, 7.0]",0.082971


In [44]:
pd.cut(validate.loc[validate["month"] == mes, feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_valid"}) # Aplicación de segmentos a validación

Unnamed: 0,index,n_tokens_title_valid
0,"(7.0, 12.0]",0.697297
1,"(12.0, 17.0]",0.264865
2,"(1.985, 7.0]",0.037838


In [40]:
# Cálculo de PSI
for feat in dc_bins.keys():
    for mes in sorted(validate["month"].unique()):
        aux_t = pd.cut(Xt[feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_train"}) # Segmentación de variable continua
        aux_v = pd.cut(validate.loc[validate["month"] == mes, feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_valid"}) # Aplicación de segmentos a validación
        aux = aux_t.merge(aux_v, on = "index")
        aux["diff"] = aux[f"{feat}_valid"] - aux[f"{feat}_train"] # Diferencia en proporciones
        aux["log"] = np.log(aux[f"{feat}_valid"] / aux[f"{feat}_train"]) # Logaritmo del cociente de proporciones
        aux["PSI"] = aux["diff"] * aux["log"] # Cálculo de PSI por rango
        print(f'PSI of {feat} in {mes}: {aux["PSI"].sum()}') # Impresión de PSI total de la variable
    break

PSI of n_tokens_title in 2014-06-01: 0.015667277954755088
PSI of n_tokens_title in 2014-07-01: 0.04759585590476128
PSI of n_tokens_title in 2014-08-01: 0.027460622591979417
PSI of n_tokens_title in 2014-09-01: 0.03843564964186774
PSI of n_tokens_title in 2014-10-01: 0.08935640322151851
PSI of n_tokens_title in 2014-11-01: 0.11359997625031418
PSI of n_tokens_title in 2014-12-01: 0.10658652269453639


### Desempeño de poder predictivo 

In [46]:
for feat in ls_cont:
    for mes in sorted(validate["month"].unique()):
        kb = SelectKBest(k=1, score_func=f_classif) # Medición de poder predictivo
        kb.fit(validate.loc[validate["month"] == mes, [feat]], validate.loc[validate["month"] == mes, "shares"]) # Segmentación por mes
        print(f'Predictive power of {feat} in {mes}: {kb.scores_[0]}') # Muestra de poder predictivo
    break 

Predictive power of n_tokens_title in 2014-06-01: 0.9416357880905367
Predictive power of n_tokens_title in 2014-07-01: 0.9235593438033854
Predictive power of n_tokens_title in 2014-08-01: 1.1451015382651484
Predictive power of n_tokens_title in 2014-09-01: 0.7706311146078957
Predictive power of n_tokens_title in 2014-10-01: 0.7637900617623378
Predictive power of n_tokens_title in 2014-11-01: 1.1831651085783175
Predictive power of n_tokens_title in 2014-12-01: 1.0492023975884752


In [47]:
# Prueba de poder predictivo discreta (IV)
for feat in ls_cont:
    for mes in sorted(validate["month"].unique()):
        validate[f"C_{feat}"] = pd.cut(validate[feat], bins=dc_bins[feat])
        print(f'Predictive power of {feat} in {mes}: {IV(validate[validate["month"] == mes], f"C_{feat}", "success")}')
    break

Predictive power of n_tokens_title in 2014-06-01: 0.012847599708707493
Predictive power of n_tokens_title in 2014-07-01: 0.07239900057035413
Predictive power of n_tokens_title in 2014-08-01: 0.07748413970215284
Predictive power of n_tokens_title in 2014-09-01: 0.05510783176573454
Predictive power of n_tokens_title in 2014-10-01: 0.06906678271419386
Predictive power of n_tokens_title in 2014-11-01: 0.010765678543019653
Predictive power of n_tokens_title in 2014-12-01: 0.12348900866895923


### Estabilidad de las características 

In [48]:
# Estabilidad de características
for feat in dc_bins.keys():
    for mes in sorted(validate["month"].unique()):
        aux_t = pd.cut(Xt[feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_train"})
        aux_v = pd.cut(validate.loc[validate["month"] == mes, feat], bins=dc_bins[feat]).value_counts(True).reset_index().rename(columns = {feat: f"{feat}_valid"})
        aux = aux_t.merge(aux_v, on = "index")
        aux["diff"] = abs(aux[f"{feat}_valid"] - aux[f"{feat}_train"])
        print(f'Stability KS of {feat} in {mes}: {max(aux["diff"])}')
    break

Stability KS of n_tokens_title in 2014-06-01: 0.052209742055652764
Stability KS of n_tokens_title in 2014-07-01: 0.06202700045999654
Stability KS of n_tokens_title in 2014-08-01: 0.061901847147188355
Stability KS of n_tokens_title in 2014-09-01: 0.0695623077664709
Stability KS of n_tokens_title in 2014-10-01: 0.11937811841134532
Stability KS of n_tokens_title in 2014-11-01: 0.13041899493894754
Stability KS of n_tokens_title in 2014-12-01: 0.11433147885143147


### Performance 

In [49]:
# Evaluación de la precisión del modelo en "producción" (discreto)
for month in sorted(validate["month"].unique()):
    aux = validate[validate["month"] == month]
    print(month, roc_auc_score(y_score=aux["yc_hat"], y_true=aux["success"]))

2014-06-01 0.5520094562647754
2014-07-01 0.5869747899159664
2014-08-01 0.5858932461873638
2014-09-01 0.5907497565725414
2014-10-01 0.5635714285714286
2014-11-01 0.5511982570806101
2014-12-01 0.5867172211350293


In [None]:
# Evaluación de la precisión del modelo en "producción" (discreto)
for month in sorted(validate["month"].unique()):
    aux = validate[validate["month"] == month]
    print(month, r2_score(y_pred=aux["yr_hat"], y_true=aux["shares"]))

2014-06-01 0.07024789483329263
2014-07-01 0.04254027626184553
2014-08-01 0.2176554592639166
2014-09-01 0.04477424748926395
2014-10-01 0.023943431449284347
2014-11-01 0.017838620440488673
2014-12-01 0.054856684218126395
