In [3]:
import pandas as pd
from numpy.ma.core import minimum
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, OrdinalEncoder, StandardScaler

train_df_copy = pd.read_csv("cleaned_train_only_dropped_y.csv")

In [4]:
train_df_copy

Unnamed: 0,app_name,app_category,size_unknown_unit,downloads_unstandardized,reviews_count,free_paid,price_if_paid,age_rating,app_tags,release_date,app_version,compatible_os_version,Y
0,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,4.7
1,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,4.3
2,Paper flowers instructions,ART_AND_DESIGN,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up,4.4
3,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up,3.8
4,Infinite Painter,ART_AND_DESIGN,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6704,TED,EDUCATION,181927,18M,"10,000,000+",Free,0,Everyone 10+,Education,"July 27, 2018",3.2.5,4.1 and up,4.6
6705,Choices: Stories You Play,GAME,807246,93M,"10,000,000+",Free,0,Teen,Simulation,"July 30, 2018",2.3.5,4.0.3 and up,4.6
6706,Toca Life: City,EDUCATION,31092,24M,"500,000+",Paid,$3.99,Everyone,Education;Pretend Play,"July 6, 2018",1.5-play,4.4 and up,4.7
6707,UNICORN - Color By Number & Pixel Art Coloring,ART_AND_DESIGN,8204,24M,"500,000+",Free,0,Everyone,Art & Design;Creativity,"August 2, 2018",1.0.9,4.4 and up,4.7


* Maybe try stratified sampling on review groups

In [5]:
import matplotlib.pyplot as plt


def plot_numeric_distribution(df, column_name, bins=50, color='skyblue'):
    plt.figure(figsize=(12, 6))
    plt.hist(df[column_name].dropna(), bins=bins, color=color, edgecolor='black')
    plt.title(f'Distribution of {column_name}', fontsize=14, pad=15)
    plt.xlabel(column_name, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

In [6]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(train_df_copy, test_size=0.2, random_state=42)

In [7]:
train_set.shape

(5367, 13)

In [8]:
test_set.shape

(1342, 13)

* For imputer try knn and iterative
* Our missing data is MNAR

In [9]:
def parse_number(value):
    """
    Convert strings like '$1.5M', '$600K', '$100,000+' into integers.
    Strips '$', ',', and '+' before processing.
    """
    if isinstance(value, str):
        value = value.strip().upper().replace("$", "").replace(",", "").replace("+", "")
        try:
            if value.endswith("M"):
                return int(float(value[:-1]) * 1_000_000)
            elif value.endswith("K"):
                return int(float(value[:-1]) * 1_000)
            elif value.replace('.', '', 1).isdigit():
                return int(float(value))
        except ValueError:
            return np.nan
    elif isinstance(value, (int, float)):
        return int(value)
    return np.nan

In [10]:
def group_reviews_count(val):
    very_low = ["1+", "5+", "10+", "50+", "100+"]
    low_mid = ["500+", "1,000+", "5,000+", "10,000+", "50,000+"]
    mid = ["100,000+", "500,000+"]
    high = ["1,000,000+", "5,000,000+", "10,000,000+"]
    top = ["50,000,000+", "100,000,000+", "500,000,000+", "1,000,000,000+"]

    if val in very_low:
        return "Very Low"
    elif val in low_mid:
        return "Low-Mid"
    elif val in mid:
        return "Mid"
    elif val in high:
        return "High"
    elif val in top:
        return "Top Tier"
    else:
        return "Other"

In [11]:
def group_holidays(holiday):
    high = [
        "Independence Day", "Veterans Day", "Thanksgiving Day",
        "Memorial Day", "Christmas Day (observed)"
    ]
    mid = [
        "Not Holiday", "Columbus Day", "Martin Luther King Jr. Day",
        "Veterans Day (observed)", "New Year's Day", "Labor Day"
    ]
    low = [
        "Christmas Day", "Washington's Birthday"
    ]

    if holiday in high:
        return "High Rating Holiday"
    elif holiday in mid:
        return "Mid Rating Holiday"
    elif holiday in low:
        return "Low Rating Holiday"
    else:
        return "Other"

In [12]:
def group_years(year):
    if year <= 2012:
        return 'Old'
    elif 2013 <= year <= 2016:
        return 'Middle'
    else:
        return 'Recent'

In [13]:
def combine_everyone_age_rating(X):
    X = X.copy()
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=["age_rating"])

    X["age_rating"] = X["age_rating"].replace({"Everyone 10+": "Everyone"})
    return X

In [14]:
import holidays

us_holidays = holidays.US(years=range(2010, 2019))
for date, name in sorted(us_holidays.items()):
    print(f"{date}: {name}")

2010-01-01: New Year's Day
2010-01-18: Martin Luther King Jr. Day
2010-02-15: Washington's Birthday
2010-05-31: Memorial Day
2010-07-04: Independence Day
2010-07-05: Independence Day (observed)
2010-09-06: Labor Day
2010-10-11: Columbus Day
2010-11-11: Veterans Day
2010-11-25: Thanksgiving Day
2010-12-24: Christmas Day (observed)
2010-12-25: Christmas Day
2010-12-31: New Year's Day (observed)
2011-01-01: New Year's Day
2011-01-17: Martin Luther King Jr. Day
2011-02-21: Washington's Birthday
2011-05-30: Memorial Day
2011-07-04: Independence Day
2011-09-05: Labor Day
2011-10-10: Columbus Day
2011-11-11: Veterans Day
2011-11-24: Thanksgiving Day
2011-12-25: Christmas Day
2011-12-26: Christmas Day (observed)
2012-01-01: New Year's Day
2012-01-02: New Year's Day (observed)
2012-01-16: Martin Luther King Jr. Day
2012-02-20: Washington's Birthday
2012-05-28: Memorial Day
2012-07-04: Independence Day
2012-09-03: Labor Day
2012-10-08: Columbus Day
2012-11-11: Veterans Day
2012-11-12: Veterans D

In [15]:
import re


import re

def extract_min_base_os(value):
    value = str(value).upper().strip()

    # Match standard version patterns: '5.0 and up', '5.0', '5.0 - 6.0', etc.
    match = re.search(r'(\d+\.\d+)', value)
    if match:
        return float(match.group(1))

    # Special case: Wear OS like '4.4W and up'
    match_wear = re.search(r'(\d+\.\d+)W', value)
    if match_wear:
        return float(match_wear.group(1))

    return 0.0  # Return 0.0 instead of string if format doesn't match


def is_wear_os(cleaned_value):
    return 'W' in cleaned_value


def is_version_range(cleaned_value):
    return '-' in cleaned_value

In [16]:
train_set["release_date"] = pd.to_datetime(train_set["release_date"])

In [17]:
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


def box_cox_pipeline():
    return make_pipeline(
        PowerTransformer(method="box-cox", standardize=True))


def category_pipeline():
    return make_pipeline(
        OneHotEncoder()
    )


def downloads_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda X: X.iloc[:, 0].map(parse_number).to_frame(), ),
        IterativeImputer(
            missing_values=np.nan,
            add_indicator=True,
            random_state=42,
        ),
        log_pipeline(),
    )


def ordinal_category_pipeline():
    return make_pipeline(
        OrdinalEncoder()
    )


def reviews_numerical_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda X: X.iloc[:, 0].map(parse_number).to_frame(), ),
        box_cox_pipeline()
    )


def review_group_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda X: X.iloc[:, 0]
                            .map(group_reviews_count)  # apply per value
                            .to_frame(), ),
        category_pipeline()
    )


def reviews_pipeline():
    return FeatureUnion([
        ("reviews_cat", ordinal_category_pipeline()),
        ("reviews_num", reviews_numerical_pipeline()),
        ("reviews_group", review_group_pipeline())
    ])


def price_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda X: X.iloc[:, 0].map(parse_number).to_frame(), ),
        log_pipeline()
    )


def age_rating_pipeline():
    return make_pipeline(
        FunctionTransformer(combine_everyone_age_rating),
        category_pipeline()
    )


def year_group_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda X: pd.DataFrame(X, columns=["release_date"])["release_date"]
                          .dt.year
                          .map(group_years)
                          .to_frame(name="year_group"),),
        category_pipeline()
    )


def holiday_group_pipeline():
    return make_pipeline(
        FunctionTransformer(lambda df: df['release_date'].map(us_holidays).fillna('Not Holiday').to_frame()),
        FunctionTransformer(group_holidays),
    )


def release_date_pipeline():
    return FeatureUnion([
        ("year", FunctionTransformer(lambda df: df["release_date"].dt.year.to_frame())),
        ("year_group", year_group_pipeline()),
        ("holiday_group", holiday_group_pipeline()),
        ("is_holiday", FunctionTransformer(lambda df: df['release_date'].isin(us_holidays).to_frame())),
        ("weekday", FunctionTransformer(lambda df: train_df_copy['release_date'].dt.weekday.to_frame())),
    ])


def os_version_pipeline():
    return make_pipeline(
        FunctionTransformer(extract_min_base_os),
        FunctionTransformer(lambda v: v.astype(float))
    )


def log_pipeline():
    return make_pipeline(FunctionTransformer(np.log1p))


column_transform = ColumnTransformer([
    ("categories", category_pipeline(), ["app_category", "free_paid"]),
    ("boxcox", box_cox_pipeline(), ["size_unknown_unit"]),
    ("downloads", downloads_pipeline(), ["downloads_unstandardized"]),
    ("reviews", reviews_pipeline(), ["reviews_count"]),
    ("price", price_pipeline(), ["price_if_paid"]),
    ("age_rating", age_rating_pipeline(), ["age_rating"]),
    ("dates", release_date_pipeline(), ["release_date"]),
    ("os", os_version_pipeline(), ["compatible_os_version"]),
],
    remainder='drop')

In [18]:
column_transform

* Currently there's still errors in mapping and numpy in the pipeline

In [122]:
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, SelectFromModel, RFE
from sklearn import set_config
df = train_set.copy()
X = df.drop(columns=['Y'])
y = df['Y']

# Apply your existing ColumnTransformer
column_transform.fit(X)
X_prepared = column_transform.transform(X)
# feature_names = column_transform.get_feature_names_out()
feature_names = X_prepared.columns

# Define different selectors
selectors = {
    'SelectKBest(F-test)': SelectKBest(score_func=f_regression, k=20),
    'SelectKBest(MI)': SelectKBest(score_func=mutual_info_regression, k=20),
    'SelectFromModel(LassoCV)': SelectFromModel(LassoCV(cv=5), threshold='median'),
    'RFE(RandomForest)': RFE(estimator=RandomForestRegressor(n_estimators=50, random_state=42), n_features_to_select=20)
}

# Collect selected feature names
selected_features = {}
for name, selector in selectors.items():
    selector.fit(X_prepared, y)
    mask = selector.get_support()
    selected_features[name] = list(np.array(feature_names)[mask])

# Create DataFrame for display
max_len = max(len(v) for v in selected_features.values())
for k, v in selected_features.items():
    selected_features[k] = v + [''] * (max_len - len(v))
df_selected = pd.DataFrame(selected_features)

df_selected

ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [123]:
from sklearn.utils import estimator_html_repr
from scipy.sparse import issparse

# Transform X
X_prepared = column_transform.transform(X)

# If sparse, convert to dense before checking for NaNs
if issparse(X_prepared):
    X_prepared = X_prepared.toarray()

# Now check for NaNs
import numpy as np
print("Any NaNs:", np.isnan(X_prepared).any())
print("NaN count:", np.isnan(X_prepared).sum())

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''