# Importar Liberías

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns',None)
import numpy as np

from sklearn.ensemble import RandomForestRegressor

# Carga de los datos

In [2]:
data_train = pd.read_csv(r'UH_2023/UH_2023_TRAIN.txt',sep='|')

# Preparación de datos

In [3]:
# cambiar el formato de las fechas

data_train.CAMPAÑA=list(map(lambda x:x+2000,data_train.CAMPAÑA))
data_train.CAMPAÑA = pd.to_datetime(data_train.CAMPAÑA,format='%Y').dt.year

# Cambiar los nombres de los columns
data_train.rename(columns={'CAMPAÑA':'date'},inplace=True)

In [4]:
# cambiar los nombres de variables en minúsculas
data_train.columns = data_train.columns.str.lower()

In [5]:
# Manejo de valores nulos de la superficie

data_train['superficie'].replace(0,np.nan,inplace=True)
data_train['superficie'] = data_train['superficie'].fillna(method='bfill')

In [6]:
# Manejo de valores nulos en la altitud

data_train.altitud.fillna(method='bfill',inplace=True)

In [7]:
# Cambiar la codificación de las variables categóricas a numeros medias

column = data_train.altitud 
mask = column.str.contains(r'\d+-\d+') 
numbers = column[mask].str.split('-', expand=True) 
numbers = numbers.astype(int) 
average = numbers.mean(axis=1) 
data_train.loc[mask, 'altitud'] = average 

print(data_train.altitud) 

0         660
1         660
2         520
3         520
4       655.0
        ...  
9596      700
9597      700
9598      700
9599      700
9600      700
Name: altitud, Length: 9601, dtype: object


# Separar data pred, data train y test

In [8]:
pred_data = data_train[data_train.produccion.isna()]

In [9]:
data_train_ori = data_train.iloc[0:7378,:]

In [10]:
# Top 6 features + variable objeto

features_rt = ['id_finca','superficie','variedad','id_zona','altitud','modo','produccion']
data_train_rt = data_train_ori[features_rt]
pred_data_rt = pred_data[features_rt]

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split (data_train_rt.drop('produccion',axis=1), 
                                                     data_train_rt.produccion,
                                                     test_size = 0.2,
                                                     random_state = 23)

In [12]:
from sklearn import metrics

def eva_score(test,pred):
    mse = metrics.mean_squared_error(test, pred) 
    print('MSE of this modelo is:', mse)
    rmse = np.sqrt(mse) 
    print('RMSE of this modelo is :', rmse)
    r2 = metrics.r2_score(test, pred) 
    print('R-squared of this modelo is:', r2)

In [13]:
mod_rt = RandomForestRegressor().fit(X_train,y_train)
pred_rt = mod_rt.predict(X_test)

eva_score(y_test,pred_rt)

MSE of this modelo is: 36307524.34428337
RMSE of this modelo is : 6025.57253248879
R-squared of this modelo is: 0.7547217386339331


# Optimizador de hiperparametros

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split':[2,10],
    'min_samples_leaf':[1,10]
}

grid_search = GridSearchCV(mod_rt,param_grid,scoring='neg_root_mean_squared_error',cv=5,verbose=2)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_e

In [14]:
mod_rt_best = RandomForestRegressor(n_estimators=300,max_depth=None,max_features='log2',min_samples_split=2,min_samples_leaf=1).fit(X_train,y_train)
pred_rt_best = mod_rt_best.predict(X_test)

eva_score(y_test,pred_rt_best)

MSE of this modelo is: 34855043.872755855
RMSE of this modelo is : 5903.816043268613
R-squared of this modelo is: 0.7645340816994146


In [16]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

extra_tree = ExtraTreesRegressor(criterion='absolute_error',random_state=0)
mod_et = BaggingRegressor(extra_tree, random_state=0).fit(X_train, y_train)
y_pred_et = mod_et.predict(X_test)


eva_score(y_test,y_pred_et)

MSE of this modelo is: 35602717.84351503
RMSE of this modelo is : 5966.801307527764
R-squared of this modelo is: 0.7594831129283822


# Predecir datos 

In [15]:
pred_data_rt['produccion'] = mod_rt_best.predict(pred_data_rt.drop('produccion',axis=1))


In [16]:
pred_data_rt['color'] = pred_data.color
pred_data_rt['tipo'] = pred_data.tipo

pred_data_rt.drop(['id_zona','altitud'],axis=1,inplace=True)
pred_data_rt=pred_data_rt.reindex(columns=['id_finca','variedad','modo','tipo','color','superficie','produccion'])
pred_data_rt.sort_values(['id_finca','variedad','modo','tipo','color','superficie'],ascending=True,inplace=True)


In [17]:
pred_data_rt

Unnamed: 0,id_finca,variedad,modo,tipo,color,superficie,produccion
9240,439,9,2,0,0,1.0800,7312.950635
9231,447,40,2,0,1,0.4694,4986.402557
8788,523,32,2,0,1,0.6478,2490.226179
9581,702,59,2,0,1,1.1331,48162.215330
8746,797,59,1,0,1,7.5143,19987.117502
...,...,...,...,...,...,...,...
9549,99282,59,2,0,1,1.6244,7877.600574
9310,99377,52,2,0,1,0.6500,2716.986440
8655,99693,81,1,0,1,6.3397,25201.981245
8540,99793,52,2,0,1,0.1326,4399.846410


In [18]:
pred_data_rt.to_csv('Universidad Complutense de Madrid (UCM)_Team Turing.txt',sep='|',encoding='utf-8',index=False, header=False,float_format='%.2f')