# Preprocessing Notebook

In [1]:
import pandas as pd

from preprocess_app_name import preprocess_app_name

# The competition files share one folder; the labelled test extract is read
# from the current working directory.
DATA_DIR = "app-rating-competition"

train_csv = pd.read_csv(f"{DATA_DIR}/train.csv")
test_csv = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission_csv = pd.read_csv(f"{DATA_DIR}/SampleSubmission.csv")
actual_test_csv = pd.read_csv("extracted_test_rows_with_Y.csv")

In [2]:
def rename_columns(dataframe):
    """Return a new DataFrame whose ``X0``..``X11`` headers have descriptive names.

    Columns that are not part of the mapping (for example the target ``Y``)
    keep their original name. The input frame is not modified.
    """
    # Position in this list is the numeric suffix of the original header:
    # index 0 -> "X0", index 11 -> "X11".
    descriptive_names = [
        "app_name",
        "app_category",
        "reviews_count",
        "size",
        "installs_count",
        "free_paid",
        "price_if_paid",
        "age_rating",
        "app_tags",
        "last_updated",
        "app_version",
        "compatible_os_version",
    ]
    header_map = {f"X{idx}": name for idx, name in enumerate(descriptive_names)}
    return dataframe.rename(columns=header_map)

In [3]:
# Give the labelled test extract the same descriptive column names that the
# training frame gets, so both can be fed to the same preprocessing pipeline.
actual_test_csv = rename_columns(actual_test_csv)

Unnamed: 0,app_name,app_category,reviews_count,size,installs_count,free_paid,price_if_paid,age_rating,app_tags,last_updated,app_version,compatible_os_version,Y
0,Girls hairstyles 2018,BEAUTY,62,3.1M,"10,000+",Free,0,Everyone,Beauty,"May 28, 2018",1.9.2,4.0.3 and up,4.2
1,Dairy Queen,FOOD_AND_DRINK,742,43M,"100,000+",Free,0,Everyone,Food & Drink,"July 25, 2018",2.1.0,4.1 and up,3.6
2,Remote Control For All AC - Universal Remote,PRODUCTIVITY,166,6.1M,"10,000+",Free,0,Everyone,Productivity,"July 28, 2018",1.1,4.0.3 and up,2.4
3,Ultimate Chest Tracker,PRODUCTIVITY,40328,23M,"1,000,000+",Free,0,Everyone,Productivity,"October 17, 2016",1.9,2.3 and up,4.5
4,Mobilight-BM,FINANCE,6,6.2M,500+,Free,0,Everyone,Finance,"August 3, 2018",2.0.10,4.0.3 and up,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,FREE Stock Market Trading Tips,FINANCE,714,3.6M,"50,000+",Free,0,Everyone,Finance,"June 7, 2017",2.10.3,2.3.3 and up,3.9
1869,Next Portuguese(BR) Langpack,TOOLS,1320,778k,"100,000+",Free,0,Everyone,Tools,"September 13, 2013",1.0,2.0 and up,4.3
1870,850 Sports News Digest,SPORTS,539,Varies with device,"10,000+",Free,0,Everyone,Sports,"August 23, 2016",Varies with device,Varies with device,4.6
1871,Lite for Facebook Messenger,COMMUNICATION,76498,4.3M,"1,000,000+",Free,0,Teen,Communication,"June 20, 2018",6.3.2,Varies with device,4.3


In [4]:
# Build the cleaned training frame in one chain:
#   1. give the columns descriptive names;
#   2. keep only rows whose target Y is <= 5. The comparison is False for NaN,
#      so rows with a missing target are dropped at this step as well;
#   3. drop exact duplicate rows;
#   4. drop rows that still have a missing value in any column.
# The earlier `train_df["Y"].fillna(train_df["Y"].median())` statement was
# removed: its result was never assigned, and step 2 already leaves no NaN in
# Y, so it had no effect on the data.
train_df = (
    rename_columns(train_csv)
    .loc[lambda frame: frame["Y"] <= 5]
    .drop_duplicates()
    .dropna()
)
train_df

Unnamed: 0,app_name,app_category,reviews_count,size,installs_count,free_paid,price_if_paid,age_rating,app_tags,last_updated,app_version,compatible_os_version,Y
0,Coloring book moana,ART_AND_DESIGN,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,3.9
1,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,4.7
2,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,4.3
3,Paper flowers instructions,ART_AND_DESIGN,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up,4.4
4,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8963,FR Forms,BUSINESS,0,9.6M,10+,Free,0,Everyone,Business,"September 29, 2016",1.1.5,4.0 and up,
8964,Sya9a Maroc - FR,FAMILY,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,4.5
8965,Fr. Mike Schmitz Audio Teachings,FAMILY,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,5.0
8966,Parkinson Exercices FR,MEDICAL,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up,


## Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from preprocessing import *

# Each entry is (step name, transformer, source columns).
# app_name, app_tags and app_version have no active transformer here, so they
# are discarded together with every other unlisted column by remainder="drop".
feature_transformers = [
    ("app_category", category_pipeline(), ["app_category"]),
    ("reviews", reviews_numerical_pipeline(), ["reviews_count"]),
    ("size", size_pipeline(), ["size"]),
    ("installs", installs_pipeline(), ["installs_count"]),
    ("free_paid", type_pipeline(), ["free_paid"]),
    ("price", price_pipeline(), ["price_if_paid"]),
    ("age_rating", age_rating_pipeline(), ["age_rating"]),
    ("last_updated", release_date_pipeline(), ["last_updated"]),
    ("os_version", os_version_pipeline(), ["compatible_os_version"]),
]

column_transform = ColumnTransformer(
    transformers=feature_transformers,
    remainder="drop",
)

In [None]:
# Display the assembled ColumnTransformer so its steps can be inspected.
column_transform

In [None]:
# Separate the target column from the feature columns.
y = train_df["Y"]
X = train_df.drop(columns="Y")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned training rows; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random Forest

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# 1) Chain the preprocessor and the forest so every fit() call below fits the
#    preprocessing on the training split only.
rf_pipe = Pipeline([
    ("preprocessing", column_transform),
    ("regressor", RandomForestRegressor(random_state=42, n_jobs=-1)),
])

# 2) Sweep n_estimators and record the score on the held-out split for each
#    value (Pipeline.score delegates to the regressor, i.e. R²).
estimators = np.arange(10, 400, 10)
scores = []

for n in estimators:
    # Cast to a plain int so the stored parameter is not a NumPy scalar.
    rf_pipe.set_params(regressor__n_estimators=int(n))
    rf_pipe.fit(X_train, y_train)
    scores.append(rf_pipe.score(X_test, y_test))

# 3) Plot the effect
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(estimators, scores, marker="o")
ax.set_title("Effect of n_estimators on RandomForestRegressor")
ax.set_xlabel("n_estimators")
ax.set_ylabel("R² on X_test")
plt.show()

# 4) Inspect the raw numbers (plain Python numbers keep the printout readable)
results = [(int(n), float(score)) for n, score in zip(estimators, scores)]
print(results)

# 5) Refit with the best-scoring setting. Without this step rf_pipe would stay
#    fitted with the last value swept (the largest n_estimators), and that is
#    the model any later predict() call would use.
best_n = int(estimators[int(np.argmax(scores))])
rf_pipe.set_params(regressor__n_estimators=best_n)
rf_pipe.fit(X_train, y_train)
print(f"Refit with best n_estimators={best_n} (R² on X_test: {max(scores):.4f})")

### Score on the actual test set

In [None]:
# Split the labelled test extract into its target column and feature columns.
y_test_set = actual_test_csv["Y"]
X_test_set = actual_test_csv.drop(columns="Y")

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Run the fitted pipeline on the labelled test extract.
y_pred = rf_pipe.predict(X_test_set)

# Compare predictions with the known targets.
mse = mean_squared_error(y_test_set, y_pred)
mae = mean_absolute_error(y_test_set, y_pred)
r2 = r2_score(y_test_set, y_pred)

# Report each metric to four decimal places.
for metric_label, metric_value in (("MSE", mse), ("MAE", mae), ("R²", r2)):
    print(f"Test {metric_label}: {metric_value:.4f}")