In [2]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

In [6]:
# Afficher le répertoire courant
print("Répertoire courant :", os.getcwd())

Répertoire courant : C:\Users\flero\Downloads


In [8]:
# Changer de répertoire (remplacez le chemin par celui de votre dossier)
os.chdir(r'D:\HEC\Msc BI\Data Mining\Devoir Hiver')
print("Nouveau répertoire :", os.getcwd())

Nouveau répertoire : D:\HEC\Msc BI\Data Mining\Devoir Hiver


In [10]:

# Importation des données
# Ici, nous utilisons pd.read_csv() pour lire les fichiers texte.
# L'argument sep=r'\s+' précise que les colonnes sont séparées par des espaces.
mobiletrain = pd.read_csv("train.csv", sep=',')
mobiletest  = pd.read_csv("test.csv", sep=',')

# Affichage des dimensions du DataFrame mobiletrain
# L'attribut .shape renvoie un tuple (nombre de lignes, nombre de colonnes)
print("Dimensions de mobiletrain :", mobiletrain.shape)


Dimensions de mobiletrain : (5000, 17)


In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import xgboost as xgb
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [14]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier


In [20]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# --- Nettoyage des colonnes ---
mobiletrain.columns = mobiletrain.columns.str.strip()
mobiletest.columns = mobiletest.columns.str.strip()

# --- Feature Engineering ---
def add_features(df):
    df['avg_score_per_session'] = df['totscore'] / (df['numsessions'] + 1)
    df['avg_purchase_per_session'] = df['totpurchases'] / (df['numsessions'] + 1)
    df['playtime_per_day'] = df['totplaytime'] / (df['numdays'] + 1)
    df['score_per_day'] = df['totscore'] / (df['numdays'] + 1)
    df['purchase_efficiency'] = df['totpurchases'] / (df['totscore'] + 1)
    df['lives_per_day'] = df['numlives'] / (df['numdays'] + 1)
    df['elements_per_session'] = df['numelements'] / (df['numsessions'] + 1)
    df['difficulty_ratio'] = df['difflevel'] / (df['skill1'] + df['skill2'] + 1)
    df['acquisition_trend_interaction'] = df['acquis'] * df['trendpurchase']
    df['platform_trend_combo'] = df['opsys'] * df['trendsession']
    return df

mobiletrain = add_features(mobiletrain)
mobiletest = add_features(mobiletest)

# --- Préparation des données ---
X = pd.get_dummies(mobiletrain.drop(columns=['y']))
y = mobiletrain['y']
X_test = pd.get_dummies(mobiletest.drop(columns=['id']))
X_test = X_test.reindex(columns=X.columns, fill_value=0)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- Split pour train/meta-model ---
X_train, X_meta, y_train, y_meta = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# --- 1. XGBoost Multiclass ---
xgb_multi = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    eval_metric='mlogloss',
    n_estimators=734,
    learning_rate=0.17687238367091132,
    max_depth=5, 
    min_child_weight=9,
    subsample=0.7167772586450419,
    colsample_bytree=0.8636447281342982,
    gamma=0.9995326576561295,
    use_label_encoder=False,
    random_state=42
)
xgb_multi.fit(X_train, y_train)
multi_proba = xgb_multi.predict_proba(X_meta)

# --- 2. XGBoost OvO ---
base_model_ovo = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=734,
    learning_rate=0.17687238367091132,
    max_depth=5, 
    min_child_weight=9,
    subsample=0.7167772586450419,
    colsample_bytree=0.8636447281342982,
    gamma=0.9995326576561295,
    use_label_encoder=False,
    random_state=42
)
ovo_model = OneVsOneClassifier(base_model_ovo)
ovo_model.fit(X_train, y_train)
ovo_proba = ovo_model.predict(X_meta).reshape(-1, 1)  # shape = (n_samples, 4)

# --- 3. Concaténation des prédictions ---
meta_features = np.hstack([multi_proba, ovo_proba])  # 8 features = 4 proba + 4 proba

# --- 4. Entraînement du méta-modèle ---
meta_model = LogisticRegression(max_iter=200)
meta_model.fit(meta_features, y_meta)

# --- 5. Générer les prédictions finales sur X_test ---
# Prédictions base models sur X_test
multi_test_proba = xgb_multi.predict_proba(X_test)
ovo_test_proba = ovo_model.predict(X_test).reshape(-1, 1)

# Concaténation
meta_test_input = np.hstack([multi_test_proba, ovo_test_proba])

# Prédictions finales
y_test_pred_encoded = meta_model.predict(meta_test_input)
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# --- 6. Création du fichier de soumission ---
submission_stacked = pd.DataFrame({
    'id': mobiletest['id'],
    'Prediction': y_test_pred
})

submission_stacked.to_csv("submission_stacked.csv", index=False)

print("✅ Fichier 'submission_stacked.csv' généré avec succès 🎉")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Fichier 'submission_stacked.csv' généré avec succès 🎉
