In [409]:
# Data
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
import joblib

import warnings
warnings.filterwarnings('ignore')


In [410]:
df = pd.read_csv("data/test.csv")
df.sample(5)

Unnamed: 0,App Name,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genre,Last Updated,Current Version,Android Version
1737,BZ Reminder PRO,BUSINESS,726,5.4M,"1,000+",Paid,$3.99,Everyone,Business,"December 20, 2017",2.3.3,4.1 and up
691,Barbie™ Fashion Closet,FAMILY,68286,85M,"10,000,000+",Free,0,Everyone,Casual;Creativity,"July 30, 2018",1.3.7,4.1 and up
1220,Recognise Foot,MEDICAL,9,95M,"1,000+",Paid,$7.49,Everyone,Medical,"September 14, 2017",1.0.3,4.1 and up
594,Diabetes:M,MEDICAL,15545,Varies with device,"100,000+",Free,0,Everyone,Medical,"August 1, 2018",6.1.3,Varies with device
755,VH1,ENTERTAINMENT,27424,17M,"1,000,000+",Free,0,Teen,Entertainment,"July 8, 2018",11.45.0,4.4 and up


Reviews

Size

In [411]:
df['Installs'] = df['Installs'].str.replace('+', '', regex=False)
df['Installs'] = df['Installs'].str.replace(',', '', regex=False)


In [412]:
df['Price'] = df['Price'].str.replace('$', '', regex=False)


In [413]:
df['Installs'] = pd.to_numeric(df['Installs'])


In [414]:
df['Price'] = pd.to_numeric(df['Price'])


last updated

In [415]:
# Convert to datetime format
df['Last Updated'] = pd.to_datetime(df['Last Updated'])

# Feature 1: Days since last update (relative to the most recent date)
most_recent_date = df['Last Updated'].max()
df['Days_Since_Update'] = (most_recent_date - df['Last Updated']).dt.days

In [416]:
df = df.drop('Last Updated', axis=1)


In [417]:
df['Is_Varies_Android'] = df['Android Version'] == 'Varies with device'
df['Is_Varies_Current'] = df['Current Version'] == 'Varies with device'


they are significant

In [418]:
def version_to_float(version):
    if pd.isna(version):
        return np.nan
    parts = version.strip().split('.')
    parts = [int(p) if p.isdigit() else 0 for p in parts]
    while len(parts) < 4:
        parts.append(0)
    return float(f"{parts[0]}.{parts[1]:02d}{parts[2]:02d}{parts[3]:02d}")

df['Current Version Num'] = df['Current Version'].apply(version_to_float)
df['Android Version Clean'] = df['Android Version'].str.replace('and up', '', regex=False).str.strip()
df['Android Version Num'] = df['Android Version Clean'].apply(version_to_float)



In [419]:
df.loc[df['Current Version'] == 'Varies with device', 'Current Version Num'] = -1
df['Android Version Num'].replace(-1, np.nan, inplace=True)


mean_val = df['Android Version Num'].mean()
df['Android Version Num'].fillna(mean_val, inplace=True)

df['Current Version Num'].replace(-1, np.nan, inplace=True)


mean_val = df['Current Version Num'].mean()
df['Current Version Num'].fillna(mean_val, inplace=True)


In [420]:
df[['Current Version', 'Current Version Num']]

Unnamed: 0,Current Version,Current Version Num
0,1.9.2,1.090200
1,2.1.0,2.010000
2,1.1,1.010000
3,1.9,1.090000
4,2.0.10,2.001000
...,...,...
1868,2.10.3,2.100300
1869,1.0,1.000000
1870,Varies with device,12775.942475
1871,6.3.2,6.030200


In [421]:
df['Current Version Num'].fillna(df['Current Version Num'].median(), inplace=True)
df['Android Version Num'].fillna(df['Android Version Num'].median(), inplace=True)


In [422]:
df[['Current Version', 'Current Version Num']]

Unnamed: 0,Current Version,Current Version Num
0,1.9.2,1.090200
1,2.1.0,2.010000
2,1.1,1.010000
3,1.9,1.090000
4,2.0.10,2.001000
...,...,...
1868,2.10.3,2.100300
1869,1.0,1.000000
1870,Varies with device,12775.942475
1871,6.3.2,6.030200


In [423]:
df[['Android Version', 'Android Version Num']]

Unnamed: 0,Android Version,Android Version Num
0,4.0.3 and up,4.0003
1,4.1 and up,4.0100
2,4.0.3 and up,4.0003
3,2.3 and up,2.0300
4,4.0.3 and up,4.0003
...,...,...
1868,2.3.3 and up,2.0303
1869,2.0 and up,2.0000
1870,Varies with device,0.0000
1871,Varies with device,0.0000


In [424]:
df.isnull().sum()

App Name                 0
Category                 0
Reviews                  0
Size                     0
Installs                 0
Type                     0
Price                    0
Content Rating           0
Genre                    0
Current Version          1
Android Version          0
Days_Since_Update        0
Is_Varies_Android        0
Is_Varies_Current        0
Current Version Num      0
Android Version Clean    0
Android Version Num      0
dtype: int64

In [425]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873 entries, 0 to 1872
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   App Name               1873 non-null   object 
 1   Category               1873 non-null   object 
 2   Reviews                1873 non-null   int64  
 3   Size                   1873 non-null   object 
 4   Installs               1873 non-null   int64  
 5   Type                   1873 non-null   object 
 6   Price                  1873 non-null   float64
 7   Content Rating         1873 non-null   object 
 8   Genre                  1873 non-null   object 
 9   Current Version        1872 non-null   object 
 10  Android Version        1873 non-null   object 
 11  Days_Since_Update      1873 non-null   int64  
 12  Is_Varies_Android      1873 non-null   bool   
 13  Is_Varies_Current      1873 non-null   bool   
 14  Current Version Num    1873 non-null   float64
 15  Andr

In [426]:
df.head()

Unnamed: 0,App Name,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genre,Current Version,Android Version,Days_Since_Update,Is_Varies_Android,Is_Varies_Current,Current Version Num,Android Version Clean,Android Version Num
0,Girls hairstyles 2018,BEAUTY,62,3.1M,10000,Free,0.0,Everyone,Beauty,1.9.2,4.0.3 and up,71,False,False,1.0902,4.0.3,4.0003
1,Dairy Queen,FOOD_AND_DRINK,742,43M,100000,Free,0.0,Everyone,Food & Drink,2.1.0,4.1 and up,13,False,False,2.01,4.1,4.01
2,Remote Control For All AC - Universal Remote,PRODUCTIVITY,166,6.1M,10000,Free,0.0,Everyone,Productivity,1.1,4.0.3 and up,10,False,False,1.01,4.0.3,4.0003
3,Ultimate Chest Tracker,PRODUCTIVITY,40328,23M,1000000,Free,0.0,Everyone,Productivity,1.9,2.3 and up,659,False,False,1.09,2.3,2.03
4,Mobilight-BM,FINANCE,6,6.2M,500,Free,0.0,Everyone,Finance,2.0.10,4.0.3 and up,4,False,False,2.001,4.0.3,4.0003


Type

In [427]:
df['Type'].unique()

array(['Free', 'Paid'], dtype=object)

In [428]:
# Check if all Free apps have Price = 0
free_check = df[df['Type'] == 'Free']['Price'].eq(0.0).all()

# Check if all Paid apps have Price > 0
paid_check = df[df['Type'] == 'Paid']['Price'].gt(0.0).all()

print("Free apps have Price 0:", free_check)
print("Paid apps have Price > 0:", paid_check)


Free apps have Price 0: True
Paid apps have Price > 0: True


In [429]:
df.drop(columns=['Type'], inplace=True)


Category

In [430]:
df['genre_list'] = df['Genre'].apply(lambda x: x.split(';'))
df['genre_list']

0              [Beauty]
1        [Food & Drink]
2        [Productivity]
3        [Productivity]
4             [Finance]
             ...       
1868          [Finance]
1869            [Tools]
1870           [Sports]
1871    [Communication]
1872           [Action]
Name: genre_list, Length: 1873, dtype: object

In [431]:

mlb = joblib.load('mlb.pkl')
# Transform the new genres using the already fitted mlb
new_genre_encoded = mlb.transform(df['genre_list'])

# Create a DataFrame from the encoded genre data
new_genre_encoded_df = pd.DataFrame(new_genre_encoded, columns=mlb.classes_, index=df.index)

# Concatenate with the new_df (drop genre_list if needed)
df = pd.concat([df, new_genre_encoded_df], axis=1)
df.drop(columns='genre_list', inplace=True)  # optional

In [432]:
df

Unnamed: 0,App Name,Category,Reviews,Size,Installs,Price,Content Rating,Genre,Current Version,Android Version,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,Girls hairstyles 2018,BEAUTY,62,3.1M,10000,0.0,Everyone,Beauty,1.9.2,4.0.3 and up,...,0,0,0,0,0,0,0,0,0,0
1,Dairy Queen,FOOD_AND_DRINK,742,43M,100000,0.0,Everyone,Food & Drink,2.1.0,4.1 and up,...,0,0,0,0,0,0,0,0,0,0
2,Remote Control For All AC - Universal Remote,PRODUCTIVITY,166,6.1M,10000,0.0,Everyone,Productivity,1.1,4.0.3 and up,...,0,0,0,0,0,0,0,0,0,0
3,Ultimate Chest Tracker,PRODUCTIVITY,40328,23M,1000000,0.0,Everyone,Productivity,1.9,2.3 and up,...,0,0,0,0,0,0,0,0,0,0
4,Mobilight-BM,FINANCE,6,6.2M,500,0.0,Everyone,Finance,2.0.10,4.0.3 and up,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,FREE Stock Market Trading Tips,FINANCE,714,3.6M,50000,0.0,Everyone,Finance,2.10.3,2.3.3 and up,...,0,0,0,0,0,0,0,0,0,0
1869,Next Portuguese(BR) Langpack,TOOLS,1320,778k,100000,0.0,Everyone,Tools,1.0,2.0 and up,...,0,0,0,0,1,0,0,0,0,0
1870,850 Sports News Digest,SPORTS,539,Varies with device,10000,0.0,Everyone,Sports,Varies with device,Varies with device,...,0,0,1,0,0,0,0,0,0,0
1871,Lite for Facebook Messenger,COMMUNICATION,76498,4.3M,1000000,0.0,Teen,Communication,6.3.2,Varies with device,...,0,0,0,0,0,0,0,0,0,0


In [433]:
df['Category'].unique()

array(['BEAUTY', 'FOOD_AND_DRINK', 'PRODUCTIVITY', 'FINANCE', 'FAMILY',
       'LIFESTYLE', 'VIDEO_PLAYERS', 'SHOPPING', 'GAME', 'TOOLS',
       'COMMUNICATION', 'EDUCATION', 'BUSINESS', 'COMICS', 'DATING',
       'BOOKS_AND_REFERENCE', 'NEWS_AND_MAGAZINES', 'PARENTING',
       'PERSONALIZATION', 'MEDICAL', 'HEALTH_AND_FITNESS',
       'TRAVEL_AND_LOCAL', 'PHOTOGRAPHY', 'SPORTS', 'LIBRARIES_AND_DEMO',
       'HOUSE_AND_HOME', 'SOCIAL', 'WEATHER', 'EVENTS',
       'AUTO_AND_VEHICLES', 'ENTERTAINMENT', 'ART_AND_DESIGN',
       'MAPS_AND_NAVIGATION'], dtype=object)

In [434]:
categories = ['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
              'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
              'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
              'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
              'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
              'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
              'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
              'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION']


category_map = {cat: i for i, cat in enumerate(categories)}


df['Category_Encoded'] = df['Category'].map(category_map)

In [435]:
df['Cat_Categorical'] = df['Category']

df = pd.get_dummies(df, columns=['Category'], prefix='One_Hot_Cat')

In [436]:
df.head(100)

Unnamed: 0,App Name,Reviews,Size,Installs,Price,Content Rating,Genre,Current Version,Android Version,Days_Since_Update,...,One_Hot_Cat_PERSONALIZATION,One_Hot_Cat_PHOTOGRAPHY,One_Hot_Cat_PRODUCTIVITY,One_Hot_Cat_SHOPPING,One_Hot_Cat_SOCIAL,One_Hot_Cat_SPORTS,One_Hot_Cat_TOOLS,One_Hot_Cat_TRAVEL_AND_LOCAL,One_Hot_Cat_VIDEO_PLAYERS,One_Hot_Cat_WEATHER
0,Girls hairstyles 2018,62,3.1M,10000,0.0,Everyone,Beauty,1.9.2,4.0.3 and up,71,...,False,False,False,False,False,False,False,False,False,False
1,Dairy Queen,742,43M,100000,0.0,Everyone,Food & Drink,2.1.0,4.1 and up,13,...,False,False,False,False,False,False,False,False,False,False
2,Remote Control For All AC - Universal Remote,166,6.1M,10000,0.0,Everyone,Productivity,1.1,4.0.3 and up,10,...,False,False,True,False,False,False,False,False,False,False
3,Ultimate Chest Tracker,40328,23M,1000000,0.0,Everyone,Productivity,1.9,2.3 and up,659,...,False,False,True,False,False,False,False,False,False,False
4,Mobilight-BM,6,6.2M,500,0.0,Everyone,Finance,2.0.10,4.0.3 and up,4,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Google Analytics,78662,22M,1000000,0.0,Everyone,Business,3.7.5,4.4 and up,175,...,False,False,False,False,False,False,False,False,False,False
96,Forgotten Hill: Surgery,2431,24M,100000,0.0,Teen,Adventure,1.4,3.0 and up,277,...,False,False,False,False,False,False,False,False,False,False
97,Weather BZ,8433,5.6M,100000,0.0,Everyone,Weather,5.0.1 build 1,4.0 and up,62,...,False,False,False,False,False,False,False,False,False,True
98,Car Parking Crane N Drifting,106,71M,50000,0.0,Everyone,Simulation,1.8,4.0.3 and up,162,...,False,False,False,False,False,False,False,False,False,False


In [437]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873 entries, 0 to 1872
Columns: 103 entries, App Name to One_Hot_Cat_WEATHER
dtypes: bool(35), float64(3), int64(57), object(8)
memory usage: 1.0+ MB


In [438]:
for col in df.columns:
    if col.startswith('One_Hot_Cat') or col.startswith('Is_Varies'):
        df[col] = df[col].astype(int)

In [439]:
df.drop(columns=['Current Version','Android Version','Android Version Clean'], inplace=True)

In [440]:
df

Unnamed: 0,App Name,Reviews,Size,Installs,Price,Content Rating,Genre,Days_Since_Update,Is_Varies_Android,Is_Varies_Current,...,One_Hot_Cat_PERSONALIZATION,One_Hot_Cat_PHOTOGRAPHY,One_Hot_Cat_PRODUCTIVITY,One_Hot_Cat_SHOPPING,One_Hot_Cat_SOCIAL,One_Hot_Cat_SPORTS,One_Hot_Cat_TOOLS,One_Hot_Cat_TRAVEL_AND_LOCAL,One_Hot_Cat_VIDEO_PLAYERS,One_Hot_Cat_WEATHER
0,Girls hairstyles 2018,62,3.1M,10000,0.0,Everyone,Beauty,71,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Dairy Queen,742,43M,100000,0.0,Everyone,Food & Drink,13,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Remote Control For All AC - Universal Remote,166,6.1M,10000,0.0,Everyone,Productivity,10,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Ultimate Chest Tracker,40328,23M,1000000,0.0,Everyone,Productivity,659,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Mobilight-BM,6,6.2M,500,0.0,Everyone,Finance,4,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868,FREE Stock Market Trading Tips,714,3.6M,50000,0.0,Everyone,Finance,426,0,0,...,0,0,0,0,0,0,0,0,0,0
1869,Next Portuguese(BR) Langpack,1320,778k,100000,0.0,Everyone,Tools,1789,0,0,...,0,0,0,0,0,0,1,0,0,0
1870,850 Sports News Digest,539,Varies with device,10000,0.0,Everyone,Sports,714,1,1,...,0,0,0,0,0,1,0,0,0,0
1871,Lite for Facebook Messenger,76498,4.3M,1000000,0.0,Teen,Communication,48,1,0,...,0,0,0,0,0,0,0,0,0,0


In [441]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873 entries, 0 to 1872
Data columns (total 100 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   App Name                         1873 non-null   object 
 1   Reviews                          1873 non-null   int64  
 2   Size                             1873 non-null   object 
 3   Installs                         1873 non-null   int64  
 4   Price                            1873 non-null   float64
 5   Content Rating                   1873 non-null   object 
 6   Genre                            1873 non-null   object 
 7   Days_Since_Update                1873 non-null   int64  
 8   Is_Varies_Android                1873 non-null   int64  
 9   Is_Varies_Current                1873 non-null   int64  
 10  Current Version Num              1873 non-null   float64
 11  Android Version Num              1873 non-null   float64
 12  Action             

In [442]:
df['Reviews'].info()
df['Reviews'] = df['Reviews'].astype(int)


<class 'pandas.core.series.Series'>
RangeIndex: 1873 entries, 0 to 1872
Series name: Reviews
Non-Null Count  Dtype
--------------  -----
1873 non-null   int64
dtypes: int64(1)
memory usage: 14.8 KB


In [443]:
df['Reviews_log'] = np.log1p(df['Reviews'])
df['Installs_log'] = np.log1p(df['Installs'])
df['Price_log'] = np.log1p(df['Price'])
df['Days_Since_Update_log'] = np.log1p(df['Days_Since_Update'])


In [444]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', '')) * 1_000  # MB to KB
    elif 'k' in size:
        return float(size.replace('k', ''))


df['Size'] = df['Size'].apply(convert_size)
df['Size_log'] = np.log(df['Size'])

In [445]:
df['Is_Free'] = (df['Price'] == 0).astype(int)

In [446]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1873 entries, 0 to 1872
Columns: 106 entries, App Name to Is_Free
dtypes: float64(9), int64(93), object(4)
memory usage: 1.5+ MB


In [447]:
df['Content Rating'].unique()

array(['Everyone', 'Teen', 'Mature 17+', 'Everyone 10+'], dtype=object)

In [448]:
rating_order = {
    'Everyone': 0,
    'Everyone 10+': 1,
    'Teen': 2,
    'Mature 17+': 3,
    'Adults only 18+': 4,
    'Unrated': -1  # or 5, depending on how you want to treat "Unrated"
}

In [449]:
df['content_rating_ordinal'] = df['Content Rating'].map(rating_order)

In [450]:
df['content_rating_Categorical'] = df['Content Rating']

In [451]:
df = pd.get_dummies(df, columns=['Content Rating'], prefix='rating')

In [452]:
df_temp = df.copy()
df_temp.drop(columns=['App Name', 'content_rating_Categorical', 'Cat_Categorical','Genre'], inplace=True)

In [453]:
columns_to_use = ['Reviews', 'Size', 'Installs', 'Price']

In [454]:


# Normalize the columns using MinMaxScaler
#scaler = joblib.load('scaler.pkl')
#df[columns_to_use] = scaler.transform(df[columns_to_use])


In [455]:

target               = ['Y']

numeric_raw          = ['Reviews', 'Size', 'Installs', 'Price',
                        'Days_Since_Update', 'Current Version Num',
                        'Android Version Num', 'Is_Varies_Android', 'Is_Free']

numeric_log          = ['Reviews_log', 'Price_log',
                        'Days_Since_Update_log', 'Size_log']

genre_onehot         = ['Action', 'Action & Adventure', 'Adventure', 'Arcade',
                        'Art & Design', 'Auto & Vehicles', 'Beauty', 'Board',
                        'Books & Reference', 'Brain Games', 'Business', 'Card',
                        'Casino', 'Casual', 'Comics', 'Communication',
                        'Creativity', 'Dating', 'Education', 'Educational',
                        'Entertainment', 'Events', 'Finance', 'Food & Drink',
                        'Health & Fitness', 'House & Home',
                        'Libraries & Demo', 'Lifestyle', 'Maps & Navigation',
                        'Medical', 'Music', 'Music & Audio', 'Music & Video',
                        'News & Magazines', 'Parenting', 'Personalization',
                        'Photography', 'Pretend Play', 'Productivity', 'Puzzle',
                        'Racing', 'Role Playing', 'Shopping', 'Simulation',
                        'Social', 'Sports', 'Strategy', 'Tools',
                        'Travel & Local', 'Trivia',
                        'Video Players & Editors', 'Weather', 'Word']

category_ordinal     = ['Category_Encoded']
category_label       = ['Cat_Categorical']
category_onehot      = ['One_Hot_Cat_EDUCATION', 'One_Hot_Cat_ENTERTAINMENT',
                        'One_Hot_Cat_FAMILY', 'One_Hot_Cat_GAME']

content_rating_ord   = ['content_rating_ordinal']
content_rating_lbl   = ['content_rating_Categorical']
content_rating_one   = [ 'rating_Everyone',
                        'rating_Everyone 10+', 'rating_Mature 17+',
                        'rating_Teen']




In [456]:
features_num_raw     = numeric_raw                      # 9 vars
features_num_log     = numeric_log                      # 4 vars
features_num_both    = numeric_raw + numeric_log        # 13 vars


In [457]:
features_ord_minimal = numeric_raw + category_ordinal + content_rating_ord
features_ord_full    = numeric_raw + numeric_log + category_ordinal + content_rating_ord
features_ord         = numeric_log + category_ordinal + content_rating_ord
features_cat1h       = numeric_raw + category_onehot
features_genre1h     = numeric_raw + genre_onehot
features_full1h      = numeric_raw + numeric_log + genre_onehot + category_onehot + content_rating_one
features_balanced    = numeric_raw + numeric_log + genre_onehot + category_ordinal + content_rating_ord
features_balanced2    = numeric_raw + category_ordinal + content_rating_ord + genre_onehot


In [458]:
model = joblib.load('balanced2RF_700_depthNone_frac.pkl')

In [459]:


x = df[features_balanced2]

In [460]:
y_pred = model.predict(x)
row_ids = x.index  # or any saved 'row_id' column

submission_df = pd.DataFrame({
    'row_id': row_ids,
    'Y': y_pred
})

submission_df.to_csv('predictions.csv', index=False)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [355]:

# Load your two CSV files
df_y = pd.read_csv('predictions.csv')      # This has the Y values only
df_ids = pd.read_csv('data/SampleSubmission.csv')    # This has the row_id values only

# Combine them by index (ignore their internal row_id/Y columns)
combined = pd.DataFrame({
    'row_id': df_ids['row_id'],
    'Y': df_y['Y']
})

# Save result if needed
combined.to_csv('data/combined_output.csv', index=False)

print(combined.head())


   row_id         Y
0    2933  4.121782
1     734  4.174946
2    1139  4.041568
3    9394  4.131872
4    7393  4.382838
