In [None]:
# Mount Google Drive under /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import joblib

#Para NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

#Métricas
import sklearn as sk
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report, make_scorer

#Configuración Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

#Regressors
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from sklearn.ensemble import RandomForestRegressor

from keras import backend as K
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, KFold

#Análisis de Sentimientos
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the train and test CSV files from the mounted Drive folder.
user_stories_train = pd.read_csv("/content/drive/My Drive/UserStories/train.csv")
user_stories_test = pd.read_csv("/content/drive/My Drive/UserStories/test.csv")

In [None]:
#Aplicamos NLP a la columna description

# Preprocesa texto y devuelve un texto tokenizado

def procesadorDeTexto(comment):
    """Normalize a free-text comment and return its list of tokens.

    Steps: lowercase, replace every ASCII punctuation character with a space,
    tokenize with NLTK's ``word_tokenize`` and drop English stopwords.
    Short tokens are NOT filtered out.

    Args:
        comment: Raw text. Any non-string value (e.g. the float NaN that
            pandas uses for an empty CSV cell, or None) is treated as empty text.

    Returns:
        list[str]: The remaining tokens in their original order; an empty
        list for non-string input.
    """
    # Missing descriptions arrive as float NaN, which has no .lower().
    if not isinstance(comment, str):
        return []

    # Lowercase first so the stopword comparison below is case-insensitive.
    comment = comment.lower()

    # Replace punctuation with spaces (instead of deleting it) so that
    # "foo/bar" becomes two tokens rather than "foobar".
    comment = comment.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    tokens = word_tokenize(comment)

    # Build the stopword set once and cache it on the function object;
    # rebuilding it for every row is needless repeated work.
    stop_words = getattr(procesadorDeTexto, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        procesadorDeTexto._stop_words = stop_words

    return [word for word in tokens if word not in stop_words]

In [None]:
# Preview the first rows of the training DataFrame.
user_stories_train.head()

Unnamed: 0,id,title,description,project,storypoint
0,5660,Error enabling Appcelerator services during ap...,"When creating the default app, I encountered t...",project8,3
1,9014,Create a maintenance branch,"As a developer, I'd like to have a maintenance...",project6,5
2,4094,Service Activity Monitoring Backend integrated...,SAM API used by SAM GUI,project1,5
3,811,fs::enter(rootfs) does not work if 'rootfs' is...,I noticed this when I was testing the unified ...,project5,2
4,4459,transform processor with script option is broken,Creating the following stream throws exception...,project6,2


In [None]:
# Run the text preprocessor over every description (one token list per row).
user_stories_train['tokens'] = user_stories_train['description'].map(procesadorDeTexto)

# Collapse each token list back into a single space-separated string.
user_stories_train['texto'] = [' '.join(token_list) for token_list in user_stories_train['tokens']]

In [None]:
# Run the text preprocessor over every description (one token list per row).
user_stories_test['tokens'] = user_stories_test['description'].map(procesadorDeTexto)

# Collapse each token list back into a single space-separated string.
user_stories_test['texto'] = [' '.join(token_list) for token_list in user_stories_test['tokens']]

In [None]:
# Feature: the preprocessed text column.
user_stories_x = user_stories_train.loc[:, 'texto'].copy()

# Target: the 'storypoint' column.
user_stories_y = user_stories_train.loc[:, 'storypoint'].copy()

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    user_stories_x,
    user_stories_y,
    test_size=0.2,
    random_state=42,
)

# **XGBoost**

Hiper-parámetros


* **learning_rate:** tasa de aprendizaje
* **max_depth:** máxima profundidad de cada árbol
* **subsample:** porcentaje de muestras usadas para cada árbol (un valor muy bajo puede causar underfitting)
*  **colsample_bytree:** porcentaje de features usadas para cada árbol (un valor muy alto puede causar overfitting)
*  **n_estimators:** cantidad de árboles a construir
*  **objective:** función de error a utilizar (p. ej.: `reg:squarederror` para regresión; `reg:logistic` o `binary:logistic` para clasificación)

Parametros de regularización:

*  **gamma:** reducción mínima del error que debe lograr un nuevo split para que se realice
*  **alpha:** regularización L1 sobre los pesos de las hojas. Un valor más alto genera una mayor regularización
*  **lambda:** similar a alpha, pero con penalización L2 (más suave), útil para la sintonía fina.






In [None]:
# Use TfidfVectorizer to turn the text into numeric vectors.
# The vectorizer is fitted on x_train only; x_test is transformed with that same fitted instance.
vectorizer = TfidfVectorizer()

x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

**Entrenando y prediciendo con una configuración base (10 árboles, profundidad máxima 5)**

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [None]:
# Baseline regressor: squared-error objective, 10 trees, max depth 5, fixed seed.
model_xgb_regresion = xgb.XGBRegressor(objective = 'reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10, random_state = 42)

In [None]:
# Fit the model on the vectorized training split.
model_xgb_regresion.fit(x_train_vectorized, y_train)

In [None]:
# Predict on the held-out split (evaluation happens in the next cell).
y_pred = model_xgb_regresion.predict(x_test_vectorized)

In [None]:
# Evaluate the model through its prediction errors on the held-out split.
# Root mean squared error (square root of the MSE).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# R^2 (coefficient of determination).
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2}")

RMSE: 2.7512678261810044
R^2: 0.08338063955307007


**Entrenando y prediciendo con 100 árboles**

In [None]:
# Same configuration as the 10-tree baseline but with 100 trees. random_state is pinned
# to the baseline's seed so the comparison varies only n_estimators, not the seed.
model_xgb_regresion = xgb.XGBRegressor(objective = 'reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100, random_state = 42)

In [None]:
# Fit the model on the vectorized training split.
model_xgb_regresion.fit(x_train_vectorized, y_train)

In [None]:
# Predict on the held-out split (evaluation happens in the next cell).
y_pred = model_xgb_regresion.predict(x_test_vectorized)

In [None]:
# Evaluate the model through its prediction errors on the held-out split.
# Root mean squared error (square root of the MSE).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# R^2 (coefficient of determination).
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2}")

RMSE: 2.5514680548285273
R^2: 0.21167808771133423


**Entrenando y prediciendo con 100 árboles y máx profundidad de 4**

In [None]:
# 100 trees with max depth 4. random_state is pinned to the same seed as the 10-tree
# baseline so that differences in the metrics come from the hyperparameters only.
model_xgb_regresion = xgb.XGBRegressor(objective = 'reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 4, alpha = 10, n_estimators = 100, random_state = 42)
model_xgb_regresion.fit(x_train_vectorized, y_train)
y_pred = model_xgb_regresion.predict(x_test_vectorized)

# Root mean squared error on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

# R^2 (coefficient of determination).
r2 = r2_score(y_test, y_pred)
print(f"R^2: {r2}")

RMSE: 2.5816960988571624
R^2: 0.19288849830627441


**K-fold cross validation:** modelo cv

Hiperparametros

*   **nfold:** cantidad de particiones (folds) que se hacen
*   **num_boost_round:** cantidad de arboles a construir (n-estimators)
*   **metrics:** metrica a usar
*   **as_pandas:** si los resultados lo devuelve en un dataframe de pandas
*   **early_stopping_rounds:** terminar antes si la metrica no mejora luego de una cantidad de n pasadas
*   **seed:** semilla




In [None]:
# Wrap the vectorized training features and their labels in an xgboost DMatrix.
data_dmatrix = xgb.DMatrix(data=x_train_vectorized,label=y_train)

In [None]:
# Cross-validated training with the native XGBoost API.

# Booster hyperparameters.
# 'n_estimators' is an argument of the scikit-learn wrapper; the native API
# (xgb.cv / xgb.train) ignores it and takes the number of trees from
# num_boost_round, so it is intentionally not listed here.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 15,
    'alpha': 10,
    'random_state': 42,
}

# K-fold settings.
cv_results = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=5,
    num_boost_round=1000,      # upper bound on the number of trees
    metrics='rmse',
    as_pandas=True,
    early_stopping_rounds=10,  # stop if there is no improvement after 10 rounds
    verbose_eval=10,           # print progress every 10 rounds
)

[0]	train-rmse:2.99960+0.04651	test-rmse:3.03447+0.19122
[10]	train-rmse:2.47864+0.03222	test-rmse:2.85672+0.19655
[20]	train-rmse:2.16019+0.02923	test-rmse:2.80222+0.20469
[30]	train-rmse:1.95144+0.02914	test-rmse:2.78272+0.21051
[40]	train-rmse:1.80548+0.02489	test-rmse:2.77213+0.21664
[50]	train-rmse:1.69312+0.02745	test-rmse:2.76837+0.21380
[60]	train-rmse:1.60869+0.02964	test-rmse:2.76141+0.21688
[70]	train-rmse:1.53475+0.03392	test-rmse:2.75845+0.22000
[78]	train-rmse:1.48295+0.03004	test-rmse:2.75821+0.22040


In [None]:
# First rows of the per-round cross-validation history.
cv_results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3.040178,0.045324,3.048872,0.191138
1,3.006847,0.044215,3.028134,0.190199
2,2.986496,0.043423,3.013657,0.188934
3,2.963705,0.043168,3.001032,0.189994
4,2.940882,0.043306,2.987432,0.190366


In [None]:
# Mean CV test RMSE at the last row of the history.
# NOTE(review): with early stopping enabled this is presumably the best round — confirm
# against the xgb.cv documentation. Read the value from the output below rather than
# hardcoding it in a comment; it changes between runs.
error_final = (cv_results['test-rmse-mean']).tail(1)
error_final

Unnamed: 0,test-rmse-mean
69,2.757629


In [None]:
# Row label of the round with the lowest mean CV test RMSE.
# cv_results rows are labelled from 0, so this is a 0-based index, not a count of rounds.
best_iteration = cv_results['test-rmse-mean'].idxmin()
best_iteration

227

In [None]:
# Train the final model with the number of rounds selected by cross-validation.
# best_iteration is a 0-based row label from cv_results, so the number of
# boosting rounds that reaches that row is best_iteration + 1.
final_model = xgb.train(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=best_iteration + 1,
)

In [None]:
# Convert the held-out features to a DMatrix.
dtest = xgb.DMatrix(x_test_vectorized)

# Predict with the final model.
y_pred = final_model.predict(dtest)

# Evaluate the model with RMSE on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE en el conjunto de prueba: {rmse}")

RMSE en el conjunto de prueba: 2.528934527191862


**con los valores encontrados, aplicamos al test**

In [None]:
# Preprocessed text of the test set, used as the model input for the submission.
user_stories_test_x = user_stories_test['texto'].copy()

In [None]:
# Reuse the vectorizer that was fitted on x_train. Fitting a fresh TfidfVectorizer
# on the test set would learn a different vocabulary, so the columns would no longer
# be the features the model was trained on (the saved output of the original cell
# shows every prediction collapsing to the same value).
user_stories_test_vectorized = vectorizer.transform(user_stories_test_x)

In [None]:
# Convert the vectorized test set to DMatrix format.
dtest_final = xgb.DMatrix(user_stories_test_vectorized)

In [None]:
# Predict the test set with the final model.
y_pred_test = final_model.predict(dtest_final)

In [None]:
# Inspect the predictions; identical values across all rows indicate a feature mismatch.
y_pred_test

array([2.9695022, 2.9695022, 2.9695022, ..., 2.9695022, 2.9695022,
       2.9695022], dtype=float32)

**Generamos el dataset de predicciones para enviar a Kaggle**

In [None]:
# Build the submission frame with two columns: id and predicted storypoint.
submition = pd.DataFrame({'id': user_stories_test['id'], 'storypoint': y_pred_test})
print(submition.shape)
submition

(1975, 2)


Unnamed: 0,id,storypoint
0,3433,2.969502
1,106,2.969502
2,7182,2.969502
3,8985,2.969502
4,2149,2.969502
...,...,...
1970,9069,2.969502
1971,3100,2.969502
1972,6648,2.969502
1973,6076,2.969502


In [None]:
# Write the submission to Drive without the index column.
submition.to_csv('/content/drive/MyDrive/UserStories/xgboost_submit2.csv', index=False)