## Loading Data

In [None]:
import optuna
import pandas as pd

# Labelled frames: both contain the target column "Y" used further down.
train_df_copy = pd.read_csv("cleaned_train_only_dropped_y.csv")
df_test_fr = pd.read_csv("extracted_test_rows_with_Y.csv")

# Competition files: test features (no target) and the submission template.
sample_submission = pd.read_csv("app-rating-competition/SampleSubmission.csv")
test_df = pd.read_csv("app-rating-competition/test.csv")


In [None]:
# Preview only the first rows instead of rendering the whole frame.
train_df_copy.head()

* Maybe try stratified sampling on review groups

In [None]:
from sklearn.model_selection import train_test_split

# Fixed-seed 80/20 split of the labelled training frame.
train_set, test_set = train_test_split(
    train_df_copy,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training part of the split.
train_set.shape

In [None]:
# (rows, columns) of the held-out part of the split.
test_set.shape

* For the imputer, try KNN imputation and iterative imputation.
* Our missing data is MNAR (missing not at random).

## Pipeline

In [None]:
from preprocessing import *

# One (name, pipeline, columns) entry per feature group; the pipeline factories
# come from the project's `preprocessing` module.
feature_transformers = [
    ("categories", category_pipeline(), ["app_category", "free_paid"]),
    ("size", box_cox_pipeline(), ["size_unknown_unit"]),
    ("downloads", downloads_pipeline(), ["downloads_unstandardized"]),
    ("reviews", reviews_pipeline(), ["reviews_count"]),
    ("price", price_pipeline(), ["price_if_paid"]),
    ("age_rating", age_rating_pipeline(), ["age_rating"]),
    ("dates", release_date_pipeline(), ["release_date"]),
    ("os", os_version_pipeline(), ["compatible_os_version"]),
]

# Columns not listed above are discarded.
column_transform = ColumnTransformer(transformers=feature_transformers, remainder="drop")

In [None]:
# Display the (unfitted) ColumnTransformer to inspect its structure.
column_transform

* Currently there are still errors in the mapping and NumPy steps of the pipeline.

In [None]:
# Work on a copy so `train_set` itself is never touched.
df = train_set.copy()

# Target is column "Y"; every other column is a candidate feature.
y_train = df['Y']
X_train = df.drop(columns=['Y'])

## XGB Model

In [None]:
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import scipy.stats as stats
from joblib import Memory

# ----- models -----
# Final regressor; n_estimators is a deliberately large ceiling.
xgb_final = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=2000,
    n_jobs=4,
    random_state=42,
)

# Smaller model backing the optional feature-selection step.
xgb_selector_base = XGBRegressor(
    max_depth=4,
    learning_rate=0.05,
    n_estimators=200,
    n_jobs=4,
    random_state=42,
)

# Built for experimentation; not part of the pipeline below.
feature_selector = SelectFromModel(xgb_selector_base, threshold="median")

# Train on log1p(Y) and map predictions back with expm1.
xgb_model_transformed = TransformedTargetRegressor(
    regressor=xgb_final,
    func=np.log1p,
    inverse_func=np.expm1,
)

# To enable feature selection, insert ("select", feature_selector) between the two steps.
xgb_steps = [
    ("preprocessing", column_transform),
    ("regression", xgb_model_transformed),
]
pipe = Pipeline(xgb_steps)

# ----- hyper-param space -----
# Search space for the final XGB model, keyed by the bare XGBRegressor
# parameter name. The selector step is disabled, so it has no entries here.
xgb_search_space = {
    'max_depth': stats.randint(4, 9),
    'learning_rate': stats.loguniform(0.02, 0.15),
    'subsample': stats.uniform(0.6, 0.4),  # uniform(loc, scale) -> 0.6–1.0
    'colsample_bytree': stats.uniform(0.6, 0.4),
    'min_child_weight': stats.randint(1, 8),
    'gamma': stats.uniform(0, 2),
    'reg_lambda': stats.loguniform(1e-2, 2),  # lambdas above ~2 are rarely useful
    'eval_metric': ['rmse'],
}

# Route every entry to pipeline step "regression" -> wrapped `regressor`.
param_dist = {
    f"regression__regressor__{name}": space
    for name, space in xgb_search_space.items()
}


def rmse(y_true, y_pred):
    """Return the root of the mean squared error between targets and predictions."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)


# Both are losses, so the scorers negate them (greater_is_better=False).
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Randomized search over the XGB pipeline, scored with negated MSE.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=200,
    cv=4,
    scoring=mse_scorer,
    n_jobs=8,
    verbose=2,
    random_state=42,
)

random_search.fit(X_train, y_train)

# best_score_ is the mean *negated MSE* (scoring=mse_scorer), so flip the sign
# to get the MSE and take the square root before calling it an RMSE.
best_cv_mse = -random_search.best_score_
print("Best params:", random_search.best_params_)
print("Best CV MSE:", best_cv_mse)
print("Best CV RMSE (sqrt of mean fold MSE):", np.sqrt(best_cv_mse))
# When the "select" step is enabled, the number of kept features is
# random_search.best_estimator_.named_steps["select"].get_support().sum()


In [None]:
# Internal hold-out (the 20% split), separated into features and target.
df_test = test_set.copy()
y_test = df_test['Y']
X_test = df_test.drop(columns=['Y'])

In [None]:
# External labelled rows (df_test_fr), separated into features and target.
df_test_seet = df_test_fr.copy()
y_test_set = df_test_seet['Y']
X_test_set = df_test_seet.drop(columns=['Y'])

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# RandomizedSearchCV was created with the default refit=True, so best_estimator_
# is already fitted on the full (X_train, y_train); fitting it again would only
# repeat that work.
best_pipeline = random_search.best_estimator_

# Predict on the external labelled rows (X_test_set / y_test_set).
y_pred = best_pipeline.predict(X_test_set)

mse = mean_squared_error(y_test_set, y_pred)
mae = mean_absolute_error(y_test_set, y_pred)
r2 = r2_score(y_test_set, y_pred)

# Output
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {np.sqrt(mse):.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test R²: {r2:.4f}")



In [None]:
# Variance of the training target, printed as a reference scale for the MSE values.
print("Y variance:", y_train.var())


## Error Analysis

In [None]:
# ------------------------------------------------------------
# 0.  Setup ― run once
# ------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

plt.style.use("default")  # no seaborn → competition‑safe

# Requires best_pipeline, X_test and y_test (the internal hold-out split).
# The y_pred left over from the evaluation cell was computed on X_test_set, a
# different frame, so subtracting it from y_test would mix unrelated rows.
# Re-predict on X_test so this cell and the plots below stay row-aligned.
y_pred = best_pipeline.predict(X_test)
assert len(y_pred) == len(y_test), "predictions and targets must be row-aligned"

residuals = y_test - y_pred
abs_error = np.abs(residuals)

# Named rmse_value so the rmse() helper function is not overwritten by a float.
rmse_value = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE = {rmse_value:.4f}   |   MAE = {mae:.4f}")


In [None]:
# True vs. predicted scatter with the identity line as a visual reference.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_pred, s=8, alpha=0.5)

y_low, y_high = y_test.min(), y_test.max()
ax.plot([y_low, y_high], [y_low, y_high], ls="--")

ax.set_xlabel("True rating")
ax.set_ylabel("Predicted rating")
ax.set_title("Residual spread")
fig.tight_layout()
plt.show()


In [None]:
# Assign each true target to a quantile bin (up to 10; duplicate edges dropped).
bins = pd.qcut(y_test, q=10, labels=False, duplicates="drop")
scored = pd.DataFrame({"true": y_test, "pred": y_pred, "bin": bins})


def _bin_rmse(group):
    """RMSE of one bin's predictions against its true values."""
    return np.sqrt(mean_squared_error(group["true"], group["pred"]))


bin_rmse = scored.groupby("bin").apply(_bin_rmse)

_, decile_ax = plt.subplots()
decile_ax.bar(bin_rmse.index.astype(str), bin_rmse.values)
decile_ax.set_xlabel("Y decile")
decile_ax.set_ylabel("RMSE")
decile_ax.set_title("RMSE across rating deciles")
plt.tight_layout()
plt.show()


In [None]:
# Distribution of the residuals in 20 bins.
_, hist_ax = plt.subplots()
hist_ax.hist(residuals, bins=20)
hist_ax.set_xlabel('Residuals')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of Residuals')
plt.show()


In [None]:
import statsmodels.api as sm

# Q-Q plot of the residuals with a 45-degree reference line.
# NOTE(review): fit=True presumably fits the reference distribution's parameters
# to the residuals before plotting — confirm against the statsmodels docs.
fig = sm.qqplot(residuals, fit=True, line="45", alpha=0.2)
plt.show()

## Cleanlab

## XGB Submission

In [None]:
# Preview only the first rows of the competition test features.
test_df.head()

In [None]:


# 1. Predict on the competition test features with the tuned XGB pipeline
y_pred_submission = best_pipeline.predict(test_df)

# 2. The template is filled positionally, so the row counts must agree
assert len(y_pred_submission) == len(sample_submission), (
    f"Length mismatch: predictions={len(y_pred_submission)} "
    f"vs template={len(sample_submission)}"
)

# 3. Replace the template's Y column with the (unrounded) predictions
sample_submission["Y"] = y_pred_submission

# 4. Save to CSV
submission_path = "one_try.csv"
sample_submission.to_csv(submission_path, index=False)

# Report the file that was actually written; predictions are not rounded.
print(f"✅ {submission_path} created successfully with {len(sample_submission)} rows.")



## LGB Model

In [None]:
def rename_columns(dataframe):
    """Return a copy of `dataframe` with columns X0..X11 given readable names.

    Columns outside X0..X11 keep their names, and the input frame is not modified.
    """
    readable_names = [
        "app_name",               # X0
        "app_category",           # X1
        "reviews_count",          # X2
        "size",                   # X3
        "installs_count",         # X4
        "free_paid",              # X5
        "price_if_paid",          # X6
        "age_rating",             # X7
        "app_tags",               # X8
        "last_updated",           # X9
        "app_version",            # X10
        "compatible_os_version",  # X11
    ]
    mapping = {f"X{position}": name for position, name in enumerate(readable_names)}
    return dataframe.rename(columns=mapping)

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) Load the labelled rows (true Y) and a previously saved prediction file
test = pd.read_csv("extracted_test_rows_with_Y.csv")
preds = pd.read_csv("stacked_nlp_try2.csv")  # swap in the prediction file to score

# 2) Rows are compared by position, so both files must have the same length
assert len(test) == len(preds), f"Length mismatch: test={len(test)} vs preds={len(preds)}"

# 3) Pull out the Y columns as plain arrays (any id column is ignored)
y_true = test["Y"].to_numpy()
y_pred = preds["Y"].to_numpy()

# 4) Compute metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# Labels are pre-padded so the colons line up in the output.
for label, value in (("MSE ", mse), ("RMSE", rmse), ("MAE ", mae), ("R²  ", r2)):
    print(f"{label}: {value:.4f}")


MSE : 0.1892
RMSE: 0.4350
MAE : 0.2941
R²  : 0.2328


In [None]:
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
import scipy.stats as stats
import optuna

# 1) Carve a 10% local hold-out out of the full labelled frame
y_full = train_df_copy["Y"]
X_full = train_df_copy.drop(columns=["Y"])
X_tr, X_val, y_tr, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.1,
    random_state=42,
)



# 2) Build the pipeline
lgb = LGBMRegressor(
    objective="regression",
    # device="gpu",
    random_state=42,
    n_jobs=4,
    # LightGBM applies `subsample` (bagging_fraction) only when bagging is
    # switched on via subsample_freq > 0; without it, searching over
    # `subsample` has no effect on the fitted model.
    subsample_freq=1,
)

# The regressor is used directly (no TransformedTargetRegressor wrapper), so its
# hyper-parameters are addressed as "reg__<param>" in the search space.
pipe_lgb = Pipeline([
    ("preprocessing", column_transform),
    ("reg", lgb)
])

# 3) Hyperparameter space
# The "reg" step of pipe_lgb is the LGBMRegressor itself, not a wrapper with a
# `regressor` attribute, so keys are "reg__<param>". The former
# "reg__regressor__<param>" keys make set_params raise "Invalid parameter".
param_dist_lgb = {
    "reg__n_estimators": stats.randint(200, 1000),
    "reg__learning_rate": stats.loguniform(0.01, 0.2),
    "reg__num_leaves": stats.randint(20, 256),
    "reg__min_child_samples": stats.randint(5, 100),
    "reg__subsample": stats.uniform(0.6, 0.4),  # uniform(loc, scale) -> 0.6–1.0
    "reg__colsample_bytree": stats.uniform(0.6, 0.4),
    "reg__reg_alpha": stats.loguniform(1e-4, 1),
    "reg__reg_lambda": stats.loguniform(1e-4, 1),
}


# 4) Scorers (both negated so that larger is better)
def rmse(y_true, y_pred):
    """Square root of the mean squared error of `y_pred` against `y_true`."""
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)


mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
rmse_scorer = make_scorer(rmse, greater_is_better=False)


# 5) Randomized search (scored with negated MSE)
lgb_search = RandomizedSearchCV(
    pipe_lgb,
    param_distributions=param_dist_lgb,
    n_iter=100,
    cv=5,
    scoring=mse_scorer,
    n_jobs=8,
    random_state=42,
    verbose=1
)
lgb_search.fit(X_tr, y_tr)

# best_score_ is the mean *negated MSE*; flip the sign for the MSE and take the
# square root before reporting an RMSE.
best_cv_mse = -lgb_search.best_score_
print("Best LGB params:", lgb_search.best_params_)
print("Best CV MSE   :", best_cv_mse)
print("Best CV RMSE  :", np.sqrt(best_cv_mse))

# 6) Evaluate on the local hold-out (X_val / y_val)
best_lgb = lgb_search.best_estimator_
y_pred = best_lgb.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
rm = np.sqrt(mse)

print(f"Hold-out MSE : {mse:.4f}")
print(f"Hold-out RMSE: {rm:.4f}")
print(f"Hold-out MAE : {mae:.4f}")
print(f"Hold-out R²  : {r2:.4f}")


In [None]:
# Score the tuned LightGBM pipeline on the external labelled rows.
y_pred = best_lgb.predict(X_test_set)

mse = mean_squared_error(y_test_set, y_pred)
rm = np.sqrt(mse)
mae = mean_absolute_error(y_test_set, y_pred)
r2 = r2_score(y_test_set, y_pred)

# Labels are pre-padded so the colons line up in the output.
for metric_label, metric_value in (("MSE ", mse), ("RMSE", rm), ("MAE ", mae), ("R²  ", r2)):
    print(f"Hold-out {metric_label}: {metric_value:.4f}")

In [None]:


# 1. Predict on the competition test features with the tuned LightGBM pipeline
y_pred_submission = best_lgb.predict(test_df)

# 2. The template is filled positionally, so the row counts must agree
assert len(y_pred_submission) == len(sample_submission), (
    f"Length mismatch: predictions={len(y_pred_submission)} "
    f"vs template={len(sample_submission)}"
)

# 3. Replace the template's Y column with the (unrounded) predictions
sample_submission["Y"] = y_pred_submission

# 4. Save to CSV
submission_path = "lgb_try.csv"
sample_submission.to_csv(submission_path, index=False)

# Report the file that was actually written; predictions are not rounded.
print(f"✅ {submission_path} created successfully with {len(sample_submission)} rows.")



In [None]:
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

# RidgeCV and KFold are used below but are not imported at the top of this cell.
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

# 1) Define base learners: the two tuned pipelines plus a linear baseline
#    (a CatBoost learner could be added here as a further entry).
estimators = [
    ("lgb", lgb_search.best_estimator_),
    ("xgb", random_search.best_estimator_),
    ("lin", Pipeline([
        ("pre", column_transform),
        ("scale", StandardScaler(with_mean=False)),
        ("reg", ElasticNetCV(
            alphas=np.logspace(-4, 1, 30),
            l1_ratio=[.1, .5, .9, 1],
            cv=5,
            n_jobs=-1)),
    ])),
]

# 2) Meta-learner: ridge regression with a cross-validated alpha
meta = RidgeCV(alphas=np.logspace(-4, 1, 20), cv=5)

# 3) Stacking regressor
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=meta,
    # passthrough=True would append the raw, untransformed input columns to the
    # meta-features. NOTE(review): those columns presumably include non-numeric
    # ones (categories, dates), which RidgeCV cannot consume — so the
    # meta-learner sees only the base predictions.
    passthrough=False,
    # Out-of-fold predictions need every sample in exactly one test fold;
    # RepeatedKFold is not a partition and is rejected by cross_val_predict.
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=8
)

# 4) Fit & evaluate
# NOTE(review): the base pipelines were tuned on subsets of this same frame, so
# the cross-validated score below is likely optimistic.
X_full = train_df_copy.drop(columns=["Y"])
y_full = train_df_copy["Y"]
stack.fit(X_full, y_full)

# For scoring (unlike stacking) repeated folds are fine; the built-in scorer
# returns the negated RMSE per fold.
cv_scores = cross_val_score(
    stack,
    X_full,
    y_full,
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
    scoring="neg_root_mean_squared_error",
    n_jobs=8
)
print(f"Improved Stack RMSE: {-cv_scores.mean():.4f}")


In [None]:
# Score the fitted stack on the external labelled rows.
y_pred = stack.predict(X_test_set)

mse = mean_squared_error(y_test_set, y_pred)
rm = np.sqrt(mse)
mae = mean_absolute_error(y_test_set, y_pred)
r2 = r2_score(y_test_set, y_pred)

# One line per metric; labels are pre-padded so the colons line up.
report_lines = [
    f"Hold-out MSE : {mse:.4f}",
    f"Hold-out RMSE: {rm:.4f}",
    f"Hold-out MAE : {mae:.4f}",
    f"Hold-out R²  : {r2:.4f}",
]
for report_line in report_lines:
    print(report_line)