In [2]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
import xgboost as xgb
import optuna


In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load datasets with raw string paths
train = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\train.csv')
test = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\test.csv')
users = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\users.csv')
user_friends = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\user_friends.csv.gz', compression='gzip')
events = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\events.csv.gz', compression='gzip')
event_attendees = pd.read_csv(r'C:\Users\yhado\associationsltd.co.uk\OneDrive - associationsltd.co.uk\ML-AI\Final project\event-recommendation-engine-challenge\event_attendees.csv.gz', compression='gzip')


In [4]:
# Verify that data is loaded correctly
print("Train dataset:")
print(train.head())

print("Test dataset:")
print(test.head())

print("Users dataset:")
print(users.head())

print("User Friends dataset:")
print(user_friends.head())

print("Events dataset:")
print(events.head())

print("Event Attendees dataset:")
print(event_attendees.head())


Train dataset:
      user       event  invited                         timestamp  interested  \
0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0   
1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0   
2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1   
3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0   
4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0   

   not_interested  
0               0  
1               0  
2               0  
3               0  
4               0  
Test dataset:
      user       event  invited                         timestamp
0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00
1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00
2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00
3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00
4  1776192  2972428928        0  2

In [13]:
# Preprocess the data
# Merge train with events to get start_time from the events dataset
train_merged = pd.merge(train, events[['event_id', 'start_time']], left_on='event', right_on='event_id', how='left')

# Merge the result with users data to include user demographic information
train_merged = pd.merge(train_merged, users, left_on='user', right_on='user_id', how='left')

# Handle missing datetime values in 'start_time' by filling with NaT (Not a Time)
train_merged['start_time'] = pd.to_datetime(train_merged['start_time'], errors='coerce')
train_merged['start_time'] = train_merged['start_time'].fillna(pd.NaT)

# Separate datetime columns from non-datetime columns
# Fill NaN for datetime columns with NaT and for other columns with 0
for col in train_merged.columns:
    if pd.api.types.is_datetime64_any_dtype(train_merged[col]):
        train_merged[col] = train_merged[col].fillna(pd.NaT)
    else:
        train_merged[col] = train_merged[col].fillna(0)

# Drop unnecessary columns, if needed
train_merged.drop(columns=['event_id', 'joinedAt', 'locale'], inplace=True)

# Preview the merged and cleaned dataset
print(train_merged.head())


      user       event  invited                        timestamp  interested  \
0  3044012  1918771225        0 2012-10-02 15:53:05.754000+00:00           0   
1  3044012  1502284248        0 2012-10-02 15:53:05.754000+00:00           0   
2  3044012  2529072432        0 2012-10-02 15:53:05.754000+00:00           1   
3  3044012  3072478280        0 2012-10-02 15:53:05.754000+00:00           0   
4  3044012  1390707377        0 2012-10-02 15:53:05.754000+00:00           0   

   not_interested                       start_time  user_id birthyear gender  \
0               0 2012-10-03 08:00:00.002000+00:00  3044012      1990   male   
1               0 2012-10-03 11:00:00.003000+00:00  3044012      1990   male   
2               0 2012-10-26 13:30:00.003000+00:00  3044012      1990   male   
3               0 2012-10-06 05:00:00.003000+00:00  3044012      1990   male   
4               0 2012-10-06 03:00:00.003000+00:00  3044012      1990   male   

  location  timezone  
0   Binjai     

In [15]:
# Cell 4: Feature engineering
# Create features for the event interactions (attendees, friends, etc.)

# First, fill missing values (NaN) with an empty string
event_attendees['yes'] = event_attendees['yes'].fillna('')
event_attendees['no'] = event_attendees['no'].fillna('')
event_attendees['maybe'] = event_attendees['maybe'].fillna('')

# Then, create new columns with the count of attendees
event_attendees['yes_count'] = event_attendees['yes'].str.split().apply(len)
event_attendees['no_count'] = event_attendees['no'].str.split().apply(len)
event_attendees['maybe_count'] = event_attendees['maybe'].str.split().apply(len)

# Preview the resulting dataframe
print(event_attendees[['yes_count', 'no_count', 'maybe_count']].head())


   yes_count  no_count  maybe_count
0          7         2            7
1         11         0            8
2          0         2            2
3          0         0            0
4          6         1            6


In [27]:
# Cell 5: Prepare training and test sets

# Check available columns in train_merged
print("Columns in train_merged:", train_merged.columns)

# If 'birthyear' is present, convert it to numeric and fill missing values
if 'birthyear' in train_merged.columns:
    train_merged['birthyear'] = pd.to_numeric(train_merged['birthyear'], errors='coerce')
    # Use assignment instead of inplace to avoid the warning
    train_merged['birthyear'] = train_merged['birthyear'].fillna(train_merged['birthyear'].mean())

# Check if 'gender' and 'location' exist before one-hot encoding
categorical_columns = []
if 'gender' in train_merged.columns:
    categorical_columns.append('gender')
if 'location' in train_merged.columns:
    categorical_columns.append('location')

# One-hot encode only if the columns exist
if categorical_columns:
    train_merged = pd.get_dummies(train_merged, columns=categorical_columns, drop_first=True)

# Prepare feature matrix X and target y
X = train_merged.drop(columns=['interested', 'not_interested', 'user', 'event'])
y = train_merged['interested']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preview the training data
print(X_train.head())


Columns in train_merged: Index(['user', 'event', 'invited', 'interested', 'not_interested', 'user_id',
       'birthyear', 'timezone', 'timestamp_year', 'timestamp_month',
       ...
       'location_Westminster  California', 'location_Whittier  California',
       'location_Wonogiri  Jawa Tengah  Indonesia',
       'location_Wonosobo  Jawa Tengah  Indonesia',
       'location_Woodland Hills  California', 'location_Yogyakarta',
       'location_Yogyakarta  10', 'location_Yonkers  New York',
       'location_Zhonghe  Heilongjiang  China',
       'location_undefined  undefined'],
      dtype='object', length=408)
       invited     user_id  birthyear  timezone  timestamp_year  \
424          0    97374877       1991    -480.0          2012.0   
554          0   147852768       1997     480.0          2012.0   
2992         0   810608905       1992    -300.0          2012.0   
1954         0   516110926       1987     -60.0          2012.0   
13884        0  3925613292       1993     420.

In [37]:
# Train the XGBoost model with the best hyperparameters from Optuna
import xgboost as xgb
import optuna

# Assuming you have the processed X_train, y_train from earlier
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna optimization
def objective(trial):
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Train model with best parameters
model = xgb.XGBClassifier(**study.best_params)
model.fit(X_train, y_train)


[I 2024-09-12 11:41:54,782] A new study created in memory with name: no-name-6999b5b3-2b82-4321-96f6-aa00e20b1408
[I 2024-09-12 11:41:56,697] Trial 0 finished with value: 0.40538528178906114 and parameters: {'max_depth': 8, 'learning_rate': 0.0011907474514665963, 'n_estimators': 313, 'subsample': 0.5005827333260404, 'colsample_bytree': 0.9427578487569652, 'gamma': 3.0010057488108135}. Best is trial 0 with value: 0.40538528178906114.
[I 2024-09-12 11:41:58,216] Trial 1 finished with value: 0.4176090390151886 and parameters: {'max_depth': 10, 'learning_rate': 0.0038933061491696536, 'n_estimators': 220, 'subsample': 0.5289643168642927, 'colsample_bytree': 0.655233532439276, 'gamma': 3.279097987930151}. Best is trial 1 with value: 0.4176090390151886.
[I 2024-09-12 11:41:59,280] Trial 2 finished with value: 0.37786188924549824 and parameters: {'max_depth': 6, 'learning_rate': 0.0010062976521747345, 'n_estimators': 159, 'subsample': 0.9811536562161998, 'colsample_bytree': 0.9224363189120395,

In [39]:
# Preprocess X_test before making predictions

# Convert 'timestamp' and 'joinedAt' to datetime, then extract useful features
if 'timestamp' in test_merged.columns:
    test_merged['timestamp'] = pd.to_datetime(test_merged['timestamp'], errors='coerce')
    test_merged['timestamp_year'] = test_merged['timestamp'].dt.year
    test_merged['timestamp_month'] = test_merged['timestamp'].dt.month
    test_merged['timestamp_day'] = test_merged['timestamp'].dt.day

if 'joinedAt' in test_merged.columns:
    test_merged['joinedAt'] = pd.to_datetime(test_merged['joinedAt'], errors='coerce')
    test_merged['joinedAt_year'] = test_merged['joinedAt'].dt.year
    test_merged['joinedAt_month'] = test_merged['joinedAt'].dt.month
    test_merged['joinedAt_day'] = test_merged['joinedAt'].dt.day

# Drop the original datetime columns since we now have numeric features
test_merged = test_merged.drop(columns=['timestamp', 'joinedAt'])

# Convert 'birthyear' to numeric and fill missing values
if 'birthyear' in test_merged.columns:
    test_merged['birthyear'] = pd.to_numeric(test_merged['birthyear'], errors='coerce')
    test_merged['birthyear'].fillna(test_merged['birthyear'].mean(), inplace=True)

# One-hot encode categorical columns like 'locale', 'gender', and 'location'
categorical_columns = ['locale', 'gender', 'location']
test_merged = pd.get_dummies(test_merged, columns=categorical_columns, drop_first=True)

# Prepare X_test for prediction
X_test = test_merged.drop(columns=['user', 'event'])  # Make sure you're dropping non-feature columns like 'user' and 'event'

# Now you can make predictions
y_test_pred = model.predict_proba(X_test)[:, 1]

# Output predictions (ranking of events by likelihood of user interest)
test_merged['predicted_interest'] = y_test_pred
recommendations = test_merged[['user', 'event', 'predicted_interest']].sort_values(by=['user', 'predicted_interest'], ascending=[True, False])

# Show the top recommendations
print(recommendations.head(10))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_merged['birthyear'].fillna(test_merged['birthyear'].mean(), inplace=True)


ValueError: feature_names mismatch: ['invited', 'user_id', 'birthyear', 'timezone', 'timestamp_year', 'timestamp_month', 'timestamp_day', 'start_time_year', 'start_time_month', 'start_time_day', 'gender_female', 'gender_male', 'location_  ', 'location_Aberdeen', 'location_Abuja  31', 'location_American Canyon  California', 'location_Anaheim  CA', 'location_Anaheim  California', 'location_Argy  Flacq  Mauritius', 'location_Atlanta  Georgia', 'location_Bajos De Haina  33', 'location_Bajos De Haina  San Cristobal  Dominican Republic', 'location_Balige  Sumatera Utara  Indonesia', 'location_Banda Aceh  Indonesia', 'location_Bandar Lampung', 'location_Bandung  30', 'location_Bandung  Indonesia', 'location_Bandung Dua  Jawa Barat  Indonesia', 'location_Bangalore  India', 'location_Bangko  Jambi  Indonesia', 'location_Bantoel  Yogyakarta  Indonesia', 'location_Bantul', 'location_Banyuwangi  Jawa Timur  Indonesia', 'location_Barboursville  WV', 'location_Barcelona  Spain', 'location_Barrie  Ontario', "location_Bat'Umi  Ajaria  Georgia", 'location_Batac  Ilocos Norte', 'location_Batam  Ouham  Central African Republic', 'location_Batavia  Jawa Barat  Indonesia', 'location_Battamban  Batdambang  Cambodia', 'location_Battambang', 'location_Beau Vallon  Grand Port  Mauritius', 'location_Bekasi', 'location_Bel Air  Flacq  Mauritius', 'location_Belle Mare  Flacq  Mauritius', 'location_Berastagi', 'location_Berkeley  California', 'location_Beverly Hills  California', 'location_Bihar Sharif', 'location_Binjai', 'location_Bogor  30', 'location_Bogor  Indonesia', 'location_Boston  Massachusetts', 'location_Boston  New York', 'location_Branford  Connecticut', 'location_Bronx  NY', 'location_Brooklyn  NY', 'location_Buena Park  California', 'location_Bumiayu  Jawa Tengah  Indonesia', 'location_Burlington  Ontario', 'location_Cachoeirinha', 'location_Cairo  11', 'location_Camp De Masque  Flacq  Mauritius', 'location_Camp Thorel  Moka  Mauritius', 'location_Caracas  Venezuela', 'location_Casablanca  45', 'location_Cassis  Port Louis  Mauritius', 'location_Cebu City', 'location_Cengkareng  Jakarta Raya  Indonesia', 'location_Centre de Flacq  Mauritius', 'location_Chandigarh  India', 'location_Ciamis  Jawa Barat  Indonesia', 'location_Cikampek  Jawa Barat  Indonesia', 'location_Cilacap  Jawa Tengah  Indonesia', 'location_Cilegon', 'location_Cilegong  Jawa Barat  Indonesia', 'location_Cimanggis  Jawa Barat  Indonesia', 'location_Cirebon', 'location_Commerce  California', 'location_Compton  California', 'location_Corona  CA', 'location_Corona  California', 'location_Costa Mesa  CA', 'location_Cottage  Riviere Du Rempart  Mauritius', 'location_Cottage Grove  Minnesota', 'location_Covina  CA', 'location_Coyote  CA', 'location_Cruce De Guayacanes  Valverde  Dominican Republic', 'location_Curepipe  Mauritius', 'location_Curup  Bengkulu  Indonesia', 'location_DKI Jakarta', 'location_Dabadie  Saint George  Trinidad And Tobago', 'location_Dallas  Texas', 'location_Delhi  07', 'location_Depok Dua Timur  Jawa Barat  Indonesia', 'location_Derby  Connecticut', 'location_Desert Hot Springs  CA', 'location_Djokja  Yogyakarta  Indonesia', 'location_Djokjakarta  Yogyakarta  Indonesia', 'location_Doha  01', 'location_Dortmund  Germany', 'location_Duarte  California', 'location_Dudhnai  Assam  India', 'location_El Monte  CA', 'location_Epsom', 'location_Eugene  Oregon', 'location_Flacq  Flacq  Mauritius', 'location_Fond du Sac  Mauritius', 'location_Fontana  CA', 'location_Fontana  California', 'location_Fort Lauderdale  Florida', 'location_Fredericton  New Brunswick', 'location_Gamping  Yogyakarta  Indonesia', 'location_Garden Grove  California', 'location_Gardena  CA', 'location_Gazipur  Dhaka  Bangladesh', 'location_Ghhanike  Punjab  Pakistan', 'location_Gilbert  Arizona', 'location_Glendale  Arizona', 'location_Gorontalo', 'location_Guangzhou  China', 'location_Guaricano  Distrito Nacional  Dominican Republic', 'location_Hacienda Heights  California', 'location_Hamilton  Ontario', 'location_Hangzhou  China', 'location_Hanoi  44', 'location_Hargeisa  Somalia', 'location_Hargesa  Woqooyi Galbeed  Somalia', 'location_Haveli  Punjab  Pakistan', 'location_Hayward  CA', 'location_Heatherton  07', 'location_Hemet  California', 'location_High Point  North Carolina', 'location_Highland Falls  New York', 'location_Hoboken  New Jersey', 'location_Hollywood  California', 'location_Huntington Beach  California', 'location_Ibiza  Islas Baleares  Spain', 'location_Indrapura  Sumatera Utara  Indonesia', 'location_Irvine  California', 'location_Isabelita  Distrito Nacional  Dominican Republic', 'location_Islamabad  Pakistan', 'location_Jakarta  04', 'location_Jakarta  Indonesia', 'location_Jayapura  Indonesia', 'location_Jeddah  14', 'location_Jogjakarta  Indonesia', 'location_Jokjakarta  Yogyakarta  Indonesia', 'location_Jombang  Jawa Timur  Indonesia', 'location_Kabanjahe', 'location_Kalasan  Yogyakarta  Indonesia', 'location_Kalibo Town  Aklan  Philippines', 'location_Kampala  Uganda', 'location_Kampot  Cambodia', 'location_Karachi  Pakistan', 'location_Kebumen  Jawa Tengah  Indonesia', 'location_Kerman  Iran', 'location_Khulna', 'location_Klaten  Jawa Tengah  Indonesia', 'location_Kompong Cham  Kampong Cham  Cambodia', 'location_Konya  Turkey', 'location_Korea  Puerto Rico', 'location_Kota  Riau  Indonesia', 'location_Kotagede  Yogyakarta  Indonesia', 'location_Krapyak  Yogyakarta  Indonesia', 'location_Kuala Lumpur  Malaysia', 'location_Kudus', 'location_Kupang  18', 'location_Kuta Raja  Aceh  Indonesia', 'location_Kwidzynia  Elblag  Poland', 'location_Kyiv  Ukraine', 'location_La Flora  Savanne  Mauritius', 'location_Lagos  Nigeria', 'location_Laguboti  Sumatera Utara  Indonesia', 'location_Lahat  Sumatera Selatan  Indonesia', 'location_Lahore  04', 'location_Lake Forest  California', 'location_Lakewood  California', 'location_Lalmatie  Flacq  Mauritius', 'location_Leamington  Ontario', 'location_Liverpool  H8', 'location_London  ON', 'location_London  Ohio', 'location_London  Ontario', 'location_London  United Kingdom', 'location_Long Beach  CA', 'location_Long Beach  California', 'location_Los Angeles  CA', 'location_Los Angeles  California', 'location_Lowell  Massachusetts', 'location_Lukachukai  AZ', 'location_Lynwood  California', 'location_Madrid  Spain', 'location_Magelang', 'location_Makassar', 'location_Malang', 'location_Manhattan  New York', 'location_Manila  Philippines', 'location_Mao Adentro  Valverde  Dominican Republic', 'location_Mataram', 'location_Mayaguez  00', 'location_Medan  26', 'location_Medan  Indonesia', 'location_Medina  Saudi Arabia', 'location_Melrose  Moka  Mauritius', 'location_Mesa  Arizona', 'location_Mexico City  Mexico', 'location_Miami  Florida', 'location_Middletown  Orange County  New York', 'location_Mill Valley  California', 'location_Millbrae  California', 'location_Milton  Ontario', 'location_Miskolc  Hungary', 'location_Mississauga  Ontario', 'location_Moka  Mauritius', 'location_Montalban  Rizal', 'location_Montasik  Aceh  Indonesia', 'location_Montclair  NJ', 'location_Montebello  CA', 'location_Montebello  California', 'location_Montreal  Quebec', 'location_Moscow  Russia', 'location_Mount Vernon  NY', 'location_Mulia  35', 'location_Multan', 'location_Murcia  Murcia', 'location_Nagpur', 'location_New Delhi  India', 'location_New Haven  CT', 'location_New Haven  Connecticut', 'location_New Kingston  Saint Andrew  Jamaica', 'location_New London  Connecticut', 'location_New York  New York', 'location_Newark  New Jersey', 'location_North Highlands  California', 'location_North Richmond  New South Wales  Australia', 'location_Norwalk  California', 'location_Nouvelle France  Grand Port  Mauritius', 'location_Oakland  California', 'location_Oberhausen  Germany', 'location_Ontario  California', 'location_Orleans  Ontario', 'location_Oshawa  Ontario', 'location_Ottawa  ON', 'location_Ottawa  Ontario', 'location_Pacoima  CA', 'location_Padang Sidempuan', 'location_Padangsidimpuan  Sumatera Utara  Indonesia', 'location_Pailles Village  Moka  Mauritius', 'location_Palembang', 'location_Palembang  Indonesia', 'location_Palermo  Italy', 'location_Palmdale  California', 'location_Palo Alto  California', 'location_Pamplemousses  Pamplemousses  Mauritius', 'location_Pangkalanbrandan  Sumatera Utara  Indonesia', 'location_Panorama City  CA', 'location_Paramount  CA', 'location_Paris  France', 'location_Pasadena  CA', 'location_Pasadena  California', 'location_Pekalongan  Indonesia', 'location_Pekanbaru', 'location_Pematangsiantar', 'location_Pematangsieantar  Sumatera Utara  Indonesia', 'location_Peoria  Arizona', 'location_Petit Raffray  Riviere Du Rempart  Mauritius', 'location_Philadelphia  Pennsylvania', 'location_Phnom Pen  Phnum Penh  Cambodia', 'location_Phnom Penh', 'location_Phnom Penh  11', 'location_Phoenix  AZ', 'location_Phoenix  Arizona', 'location_Placentia  California', 'location_Plaine Des Papayes  Pamplemousses  Mauritius', 'location_Poipet', 'location_Pomona  California', 'location_Pontianak  Indonesia', 'location_Porsea  Sumatera Utara  Indonesia', 'location_Port Harcourt  50', 'location_Port Louis  18', 'location_Port Louis  Mauritius', 'location_Poughkeepsie  NY', 'location_Probolinggo', 'location_Pune  16', 'location_Purwokerto  Jawa Tengah  Indonesia', 'location_Purworejo  Jawa Tengah  Indonesia', 'location_Quatre Bornes  17', 'location_Quatre Bornes  Mauritius', 'location_Queens  New York', 'location_Quetta  02', 'location_Quetta  Pakistan', 'location_Quito  Ecuador', 'location_Rancho Palos Verdes  California', 'location_Rantauprapat  Sumatera Utara  Indonesia', 'location_Redondo Beach  California', 'location_Riverside  CA', 'location_Roches Noires  Riviere Du Rempart  Mauritius', 'location_Rochester  New York', 'location_Rose Belle  Grand Port  Mauritius', 'location_Rose Hill  Mauritius', 'location_Saint Domingue  Distrito Nacional  Dominican Republic', 'location_Saint Hubert  Grand Port  Mauritius', 'location_Salalah  Oman', 'location_Salatiga', 'location_Salinas  CA', 'location_San Clemente  California', 'location_San Diego  California', 'location_San Felipe de Puerto Plata', 'location_San Francisco  CA', 'location_San Francisco  California', 'location_San Gabriel  California', 'location_San Jacinto  California', 'location_San Jose  California', 'location_San Lorenzo  Santa Fe', 'location_San Pedro  CA', 'location_Santa Ana  California', 'location_Santa Clara  California', 'location_Santa Clarita  California', 'location_Santa Fe Springs  California', 'location_Santiago De Los Caballeros  Santiago  Dominican Republic', 'location_Santo Domingo  05', 'location_Santo Domingo  Dominican Republic', 'location_Santos  Sao Paulo  Brazil', 'location_Scottsdale  AZ', 'location_Scottsdale  Arizona', 'location_Semarang  07', 'location_Semarang  Indonesia', 'location_Seoul  11', 'location_Seoul  Korea', 'location_Sheffield', 'location_Shelton  Connecticut', 'location_Shinjuku-ku  Tokyo  Japan', 'location_Siantar', 'location_Sibolga', 'location_Siborongborong  Sumatera Utara  Indonesia', 'location_Sidoarjo  Jawa Timur  Indonesia', 'location_Siem Reap', 'location_Sierra Madre  California', 'location_Sigli  Aceh  Indonesia', 'location_Sihanoukville', 'location_Simi Valley  California', 'location_Singkil  Aceh  Indonesia', 'location_Sleman', 'location_Soerabaya', 'location_Solo City', 'location_Soroako', 'location_South Kenosha  Wisconsin', 'location_Spanish Town  10', 'location_Stanford  California', 'location_Staten Island  New York', 'location_Stoney Creek  ON', 'location_Sudbury  Ontario', 'location_Surabaya', 'location_Surabaya  08', 'location_Surabaya  Indonesia', 'location_Surakarta  07', 'location_Surakarta  Indonesia', 'location_Sydney  02', 'location_Sydney  Australia', 'location_Tabanan  02', 'location_Takengon  Aceh  Indonesia', 'location_Tangerang', 'location_Tangerangscheweg  Jawa Barat  Indonesia', 'location_Tanjungbalai  Sumatera Utara  Indonesia', 'location_Tarutung  Sumatera Utara  Indonesia', 'location_Tasikmalaya', 'location_Tbilisi  Georgia', 'location_Tebing Tinggi', 'location_Temanggung  Jawa Tengah  Indonesia', 'location_Tempe  Arizona', 'location_Terre Rouge  Pamplemousses  Mauritius', 'location_Thimphu  Bhutan', 'location_Tonekabon', 'location_Toronto  ON', 'location_Toronto  Ontario', 'location_Torrance  CA', 'location_Trabek  05', 'location_Triolet  Mauritius', 'location_Trou Aux Biches  Pamplemousses  Mauritius', "location_Trou D'Eau Douce  Flacq  Mauritius", 'location_Tucson  Arizona', 'location_Turin  Italy', 'location_Udaipur  Rajasthan', 'location_Union Park  Grand Port  Mauritius', 'location_Vaughan  Ontario', 'location_Venice  California', 'location_Villa Consuelo', 'location_Villa Mella  Dominican Republic', 'location_Ville De Phnom Penh  Phnum Penh  Cambodia', 'location_Waianae  Hawaii', 'location_Waterbury  Connecticut', 'location_Wellesley  Massachusetts', 'location_Westminster  California', 'location_Whittier  California', 'location_Wonogiri  Jawa Tengah  Indonesia', 'location_Wonosobo  Jawa Tengah  Indonesia', 'location_Woodland Hills  California', 'location_Yogyakarta', 'location_Yogyakarta  10', 'location_Yonkers  New York', 'location_Zhonghe  Heilongjiang  China', 'location_undefined  undefined'] ['invited', 'user_id', 'birthyear', 'timezone', 'yes_count', 'no_count', 'maybe_count', 'timestamp_year', 'timestamp_month', 'timestamp_day', 'joinedAt_year', 'joinedAt_month', 'joinedAt_day', 'locale_ar_AR', 'locale_bs_BA', 'locale_de_DE', 'locale_en_GB', 'locale_en_US', 'locale_es_ES', 'locale_es_LA', 'locale_fa_IR', 'locale_fr_FR', 'locale_id_ID', 'locale_it_IT', 'locale_ja_JP', 'locale_ka_GE', 'locale_km_KH', 'locale_ms_MY', 'locale_pt_BR', 'locale_ru_RU', 'locale_sv_SE', 'locale_zh_CN', 'gender_female', 'gender_male', 'location_  ', 'location_Acireale', 'location_Addis Ababa  Ethiopia', 'location_Aeknabara  Sumatera Utara  Indonesia', 'location_Agoura Hills  California', 'location_Alexandria  Egypt', 'location_Alexandria  VA', 'location_Aliso Viejo  CA', 'location_Allentown  PA', 'location_Altadena  CA', 'location_Amsterdam  Netherlands', 'location_Anaheim  California', 'location_Austin  TX', 'location_Avondale  Arizona', 'location_Aylmer  Quebec', 'location_Bahawalpur', 'location_Banda  Aceh  Indonesia', 'location_Banda Aceh  01', 'location_Banda Aceh  Indonesia', 'location_Banda Atjeh  Aceh  Indonesia', 'location_Bandar Lampung', 'location_Bandar Lampung  Indonesia', 'location_Bandung  Indonesia', 'location_Bangalore  India', 'location_Banlung', 'location_Bantoel  Yogyakarta  Indonesia', 'location_Bantul', 'location_Banyuwangi  Jawa Timur  Indonesia', 'location_Barrie  Ontario', 'location_Batam  40', 'location_Battamban  Batdambang  Cambodia', 'location_Battambang', 'location_Bavet  Svay Rieng  Cambodia', 'location_Bekasi', 'location_Bel Air  Flacq  Mauritius', 'location_Belawan', 'location_Belle Mare  Flacq  Mauritius', 'location_Bellflower  CA', 'location_Beverly Hills  California', 'location_Binji', 'location_Blora  Jawa Tengah  Indonesia', 'location_Bogor  Indonesia', 'location_Boston  Massachusetts', 'location_Brampton  ON', 'location_Brampton  Ontario', 'location_Bridgeport  Connecticut', 'location_Bronx  New York', 'location_Buena Park  California', 'location_Cagayan de Oro  Philippines', 'location_California City  California', 'location_Camp De Masque  Flacq  Mauritius', 'location_Canberra  Australian Capital Territory', 'location_Ciamis  Jawa Barat  Indonesia', 'location_Cilacap  Jawa Tengah  Indonesia', 'location_Cilegon', 'location_Cirebon', 'location_Cluny  Grand Port  Mauritius', 'location_Compton  California', 'location_Congomah  16', 'location_Costa Mesa  CA', 'location_Cristo Rey  Distrito Nacional', 'location_Curepipe  Mauritius', 'location_Cypress  California', 'location_DKI Jakarta', 'location_Daly City  California', 'location_Denpasar  Bali  Indonesia', 'location_Djambi  Jambi  Indonesia', 'location_Djelfa  22', 'location_Djokja  Yogyakarta  Indonesia', 'location_Downey  CA', 'location_Dubai  United Arab Emirates', 'location_Dumai', 'location_Edmonton  Alberta', 'location_Emo  Ontario', 'location_Fagersta  25', 'location_Flacq  Flacq  Mauritius', 'location_Fontana  California', 'location_Gamping  Yogyakarta  Indonesia', 'location_Garut 1  Jawa Barat  Indonesia', 'location_Grand Gaube  Mauritius', 'location_Grande Pointe Aux Piments  Pamplemousses  Mauritius', 'location_Grande Prairie  AB', 'location_Guria', 'location_Hamilton  Ontario', 'location_Hanoi  44', 'location_Hargeisa  Somalia', 'location_Hoboken  New Jersey', 'location_Hollywood  California', 'location_Houston  Texas', 'location_Huntington Beach  California', 'location_Irvine  CA', 'location_Irvine  California', 'location_Islamabad  08', 'location_Jakarta  04', 'location_Jakarta  Indonesia', 'location_Jeddah  14', 'location_Jogjakarta  Indonesia', 'location_Jokjakarta  Yogyakarta  Indonesia', 'location_Jombang  Jawa Timur  Indonesia', 'location_Kabanjahe', 'location_Kampala  Uganda', 'location_Kendal  Indonesia', 'location_Kitchener  Ontario', 'location_Klaten  Jawa Tengah  Indonesia', 'location_Kosciusko  Mississippi', 'location_Kotagede  Yogyakarta  Indonesia', 'location_Kuala Lumpur  Malaysia', 'location_Kudus', 'location_Kuningan Satu  Jawa Barat  Indonesia', 'location_Kuwait City', 'location_La Habra Heights  California', 'location_Lahat  Sumatera Selatan  Indonesia', 'location_Langsa', 'location_Lasem  Jawa Tengah  Indonesia', 'location_Laventure  13', 'location_Lawndale  California', 'location_Lawrence  Massachusetts', 'location_Lindsay  ON', 'location_London  Ontario', 'location_London  United Kingdom', 'location_Long Beach  California', 'location_Los Altos  CA', 'location_Los Angeles  CA', 'location_Los Angeles  California', 'location_Los Gatos  California', 'location_Madrid  Spain', 'location_Makati  D9', 'location_Malang', 'location_Malipampang  Bulacan  Philippines', 'location_Manchester  United Kingdom', 'location_Mankato  Minnesota', 'location_Manoguayabo  Distrito Nacional  Dominican Republic', 'location_Mao  Valverde', 'location_Marysville  New Brunswick', 'location_Mataram', 'location_Medan  26', 'location_Medan  Indonesia', 'location_Melbourne  07', 'location_Mesa  Arizona', 'location_Miami  Florida', 'location_Miami Beach  Florida', 'location_Miches', 'location_Minneapolis  Minnesota', 'location_Mississauga  Ontario', 'location_Monrovia  CA', 'location_Montclair  California', 'location_Monterey Park  CA', 'location_Moreno Valley  California', 'location_Mumbai  16', 'location_Mumbai  Maharashtra  India', 'location_Negara  Bali  Indonesia', 'location_New Delhi  India', 'location_New Haven  Connecticut', 'location_New York  New York', 'location_Newport Beach  California', 'location_Nieuw Singkil  Aceh  Indonesia', 'location_North Hollywood  CA', 'location_Oceanside  California', 'location_Orange  California', 'location_Ottawa  ON', 'location_Ottawa  Ontario', 'location_Pacoima  CA', 'location_Padang  Indonesia', 'location_Padangsidimpuan  Sumatera Utara  Indonesia', 'location_Palangkaraya', 'location_Pamplemousses  Pamplemousses  Mauritius', 'location_Paramount  California', 'location_Parapat  Sumatera Utara  Indonesia', 'location_Paris  France', 'location_Pasig', 'location_Paterson  New Jersey', 'location_Pekanbaru', 'location_Pematangraya  Sumatera Utara  Indonesia', 'location_Pematangsiantar', 'location_Pematangsieantar  Sumatera Utara  Indonesia', 'location_Peoria  Arizona', 'location_Petit Raffray  Riviere Du Rempart  Mauritius', 'location_Phnom Pen  Phnum Penh  Cambodia', 'location_Phnom Penh', 'location_Phnom Penh  11', 'location_Phoenix  Arizona', 'location_Piton  Riviere Du Rempart  Mauritius', 'location_Plaine Des Papayes  Pamplemousses  Mauritius', 'location_Pnom Penh  Phnum Penh  Cambodia', 'location_Pontianak  Indonesia', 'location_Porsea  Sumatera Utara  Indonesia', 'location_Port Louis  18', 'location_Port Louis  Mauritius', 'location_Port Louis Town  Port Louis  Mauritius', 'location_Puncak Alam', 'location_Purwokerto  Jawa Tengah  Indonesia', 'location_Purworejo  Jawa Tengah  Indonesia', 'location_Quatre Bornes  17', 'location_Quatre Bornes  Mauritius', 'location_Quetta  Pakistan', 'location_Quezon City  F2', 'location_Quezon City  Philippines', 'location_Rajapolah  Jawa Barat  Indonesia', 'location_Rancho Santa Margarita  CA', 'location_Rantauprapat  Sumatera Utara  Indonesia', 'location_Rose Belle  Grand Port  Mauritius', 'location_Rose Hill  Mauritius', 'location_Rosemead  CA', 'location_Samarinda', 'location_Sampaloc  53', 'location_San Carlos  05', 'location_San Felipe de Puerto Plata', 'location_San Francisco  CA', 'location_San Francisco  California', 'location_San Jose  California', 'location_San Mateo  CA', 'location_San Pedro  California', 'location_San Rafael  CA', 'location_Sanchez Mira', 'location_Santa Ana  CA', 'location_Santa Clara  California', 'location_Santa Monica  California', 'location_Santiago  25', 'location_Santiago  Chile', 'location_Santiago De Los Caballeros  Santiago  Dominican Republic', 'location_Santo Domingo  05', 'location_Santo Domingo  Dominican Republic', 'location_Sausalito  California', 'location_Scottsdale  AZ', 'location_Sebastopol  Flacq  Mauritius', 'location_Semarang  Indonesia', 'location_Seoul  Korea', 'location_Sialkot  Punjab', 'location_Siantar', 'location_Sibolga', 'location_Sidareja  Jawa Tengah  Indonesia', 'location_Siem Reap', 'location_Sihanoukville', 'location_Singapore  Singapore', 'location_Sleman', 'location_Sleman  10', 'location_Solo City', 'location_South El Monte  CA', 'location_South Gate  California', 'location_South Orange  NJ', 'location_Sragen  Jawa Tengah  Indonesia', 'location_Sunggal  Sumatera Utara  Indonesia', 'location_Surabaya', 'location_Surabaya  08', 'location_Surabaya  Indonesia', 'location_Surakarta', 'location_Surallah  South Cotabato', 'location_Sylmar  CA', 'location_Taguig', 'location_Tangerang', 'location_Tanjungbalai  Sumatera Utara  Indonesia', 'location_Tanta  05', 'location_Tasikmalaya', 'location_Tbilisi  Georgia', 'location_Tebing Tinggi', 'location_Tehran  Iran', 'location_Tempe  Arizona', 'location_Thornhill  Ontario', 'location_Timika  Papua  Indonesia', 'location_Toronto  ON', 'location_Toronto  Ontario', 'location_Torrance  CA', 'location_Torrance  California', 'location_Trabek  05', 'location_Triolet  Mauritius', "location_Trou D'Eau Douce  Flacq  Mauritius", 'location_Tucson  AZ', 'location_Tucson  Arizona', 'location_Tulungagung', 'location_Tuol  03', 'location_Udaipur  Rajasthan', 'location_Ungaran  Jawa Tengah  Indonesia', 'location_Urbana  Illinois', 'location_Vancouver  British Columbia', 'location_Venice  California', 'location_Waterbury  Connecticut', 'location_Waterloo  Ontario', 'location_Weggis  Switzerland', 'location_West Covina  California', 'location_Windsor  Ontario', 'location_Wolcott  Connecticut', 'location_Yogyakarta', 'location_Yogyakarta  10', 'location_Yorba Linda  CA', 'location_Zagreb  Croatia', 'location_Zamboanga City']
expected location_Palembang  Indonesia, location_Nouvelle France  Grand Port  Mauritius, location_Epsom, location_Fontana  CA, location_San Gabriel  California, location_Trou Aux Biches  Pamplemousses  Mauritius, location_Bat'Umi  Ajaria  Georgia, location_North Highlands  California, location_Aberdeen, location_Atlanta  Georgia, location_Lowell  Massachusetts, location_Kalasan  Yogyakarta  Indonesia, location_Rancho Palos Verdes  California, location_Batam  Ouham  Central African Republic, location_Seoul  11, location_Cottage  Riviere Du Rempart  Mauritius, location_Desert Hot Springs  CA, location_High Point  North Carolina, location_Placentia  California, location_Norwalk  California, location_Chandigarh  India, location_Murcia  Murcia, location_Haveli  Punjab  Pakistan, location_Ville De Phnom Penh  Phnum Penh  Cambodia, location_Semarang  07, location_Islamabad  Pakistan, location_Karachi  Pakistan, location_Korea  Puerto Rico, location_Mill Valley  California, location_Bihar Sharif, location_Pasadena  California, location_Wonosobo  Jawa Tengah  Indonesia, location_Stanford  California, location_Guaricano  Distrito Nacional  Dominican Republic, location_New London  Connecticut, location_Surakarta  07, location_Pasadena  CA, location_Soroako, location_Krapyak  Yogyakarta  Indonesia, location_Mount Vernon  NY, location_London  Ohio, start_time_month, location_Commerce  California, location_Ibiza  Islas Baleares  Spain, location_Villa Mella  Dominican Republic, location_Spanish Town  10, location_Barboursville  WV, location_Curup  Bengkulu  Indonesia, location_Montebello  CA, location_New Haven  CT, location_undefined  undefined, location_Takengon  Aceh  Indonesia, location_Miskolc  Hungary, location_Ontario  California, location_Terre Rouge  Pamplemousses  Mauritius, location_Corona  CA, location_Bandung Dua  Jawa Barat  Indonesia, location_Pailles Village  Moka  Mauritius, location_Binjai, location_Heatherton  07, location_Pekalongan  Indonesia, location_Kupang  18, location_Siborongborong  Sumatera Utara  Indonesia, location_Union Park  Grand Port  Mauritius, location_Boston  New York, location_Mao Adentro  Valverde  Dominican Republic, location_Corona  California, location_Pomona  California, location_Pangkalanbrandan  Sumatera Utara  Indonesia, location_Nagpur, location_Sheffield, location_Quetta  02, location_Balige  Sumatera Utara  Indonesia, location_Lakewood  California, location_Fort Lauderdale  Florida, location_Palo Alto  California, location_Montreal  Quebec, location_Cengkareng  Jakarta Raya  Indonesia, location_Villa Consuelo, location_Derby  Connecticut, location_Sierra Madre  California, location_Khulna, location_Gorontalo, location_Cassis  Port Louis  Mauritius, location_Depok Dua Timur  Jawa Barat  Indonesia, location_Indrapura  Sumatera Utara  Indonesia, location_Santa Fe Springs  California, location_Lahore  04, location_Simi Valley  California, location_Kalibo Town  Aklan  Philippines, location_Cairo  11, start_time_day, location_Cikampek  Jawa Barat  Indonesia, location_Kwidzynia  Elblag  Poland, location_Kerman  Iran, location_Palmdale  California, location_Bajos De Haina  33, location_Sigli  Aceh  Indonesia, location_Manhattan  New York, location_Batavia  Jawa Barat  Indonesia, location_North Richmond  New South Wales  Australia, location_Shelton  Connecticut, location_Liverpool  H8, location_Yonkers  New York, location_Batac  Ilocos Norte, location_Redondo Beach  California, location_Cachoeirinha, location_Scottsdale  Arizona, location_Milton  Ontario, location_Staten Island  New York, location_Turin  Italy, location_Brooklyn  NY, location_Phoenix  AZ, location_Makassar, location_London  ON, location_Montasik  Aceh  Indonesia, location_Orleans  Ontario, location_Glendale  Arizona, location_Tabanan  02, location_Dudhnai  Assam  India, location_Bogor  30, location_Kebumen  Jawa Tengah  Indonesia, location_Gardena  CA, location_Branford  Connecticut, location_Panorama City  CA, location_Cilegong  Jawa Barat  Indonesia, location_Berastagi, location_Mayaguez  00, location_Oshawa  Ontario, location_Montalban  Rizal, location_Cimanggis  Jawa Barat  Indonesia, location_Palembang, location_San Clemente  California, location_Waianae  Hawaii, location_Bumiayu  Jawa Tengah  Indonesia, location_Queens  New York, location_Rochester  New York, location_Middletown  Orange County  New York, location_Wonogiri  Jawa Tengah  Indonesia, location_Kampot  Cambodia, location_Surakarta  Indonesia, location_Moka  Mauritius, location_Lagos  Nigeria, location_Lynwood  California, location_San Diego  California, location_Hayward  CA, location_Santa Clarita  California, location_Multan, start_time_year, location_Probolinggo, location_Philadelphia  Pennsylvania, location_Gazipur  Dhaka  Bangladesh, location_Magelang, location_Ghhanike  Punjab  Pakistan, location_Westminster  California, location_Port Harcourt  50, location_Caracas  Venezuela, location_Melrose  Moka  Mauritius, location_Montebello  California, location_Argy  Flacq  Mauritius, location_Abuja  31, location_Covina  CA, location_Hemet  California, location_New Kingston  Saint Andrew  Jamaica, location_Casablanca  45, location_Cebu City, location_Hangzhou  China, location_Hargesa  Woqooyi Galbeed  Somalia, location_San Lorenzo  Santa Fe, location_South Kenosha  Wisconsin, location_Berkeley  California, location_Guangzhou  China, location_Singkil  Aceh  Indonesia, location_Coyote  CA, location_Djokjakarta  Yogyakarta  Indonesia, location_Santa Ana  California, location_Lukachukai  AZ, location_Delhi  07, location_Laguboti  Sumatera Utara  Indonesia, location_Kyiv  Ukraine, location_Bronx  NY, location_Lake Forest  California, location_Jayapura  Indonesia, location_Fond du Sac  Mauritius, location_Padang Sidempuan, location_Isabelita  Distrito Nacional  Dominican Republic, location_Zhonghe  Heilongjiang  China, location_Palermo  Italy, location_Roches Noires  Riviere Du Rempart  Mauritius, location_Stoney Creek  ON, location_American Canyon  California, location_Montclair  NJ, location_Millbrae  California, location_Wellesley  Massachusetts, location_Moscow  Russia, location_Dallas  Texas, location_Riverside  CA, location_Kota  Riau  Indonesia, location_Barcelona  Spain, location_Poughkeepsie  NY, location_La Flora  Savanne  Mauritius, location_Bandung  30, location_Mulia  35, location_Cruce De Guayacanes  Valverde  Dominican Republic, location_Soerabaya, location_Temanggung  Jawa Tengah  Indonesia, location_Salalah  Oman, location_Sidoarjo  Jawa Timur  Indonesia, location_Thimphu  Bhutan, location_Duarte  California, location_Gilbert  Arizona, location_Bajos De Haina  San Cristobal  Dominican Republic, location_Kompong Cham  Kampong Cham  Cambodia, location_Dabadie  Saint George  Trinidad And Tobago, location_Hacienda Heights  California, location_Centre de Flacq  Mauritius, location_San Jacinto  California, location_Mexico City  Mexico, location_Paramount  CA, location_Garden Grove  California, location_Konya  Turkey, location_Dortmund  Germany, location_Whittier  California, location_Sudbury  Ontario, location_Bangko  Jambi  Indonesia, location_Woodland Hills  California, location_Burlington  Ontario, location_Long Beach  CA, location_Lalmatie  Flacq  Mauritius, location_Sydney  Australia, location_Doha  01, location_Sydney  02, location_Manila  Philippines, location_Leamington  Ontario, location_Oberhausen  Germany, location_Cottage Grove  Minnesota, location_El Monte  CA, location_Tangerangscheweg  Jawa Barat  Indonesia, location_Poipet, location_Tarutung  Sumatera Utara  Indonesia, location_Beau Vallon  Grand Port  Mauritius, location_Santos  Sao Paulo  Brazil, location_Camp Thorel  Moka  Mauritius, location_Quito  Ecuador, location_Newark  New Jersey, location_Highland Falls  New York, location_Oakland  California, location_Eugene  Oregon, location_Saint Domingue  Distrito Nacional  Dominican Republic, location_Medina  Saudi Arabia, location_Salatiga, location_Vaughan  Ontario, location_Pune  16, location_Tonekabon, location_Fredericton  New Brunswick, location_Anaheim  CA, location_Shinjuku-ku  Tokyo  Japan, location_Salinas  CA, location_Saint Hubert  Grand Port  Mauritius, location_San Pedro  CA, location_Kuta Raja  Aceh  Indonesia in input data
training data did not have the following fields: location_North Hollywood  CA, location_Denpasar  Bali  Indonesia, location_Pasig, location_Taguig, no_count, location_Sialkot  Punjab, location_Santiago  25, location_Montclair  California, location_Blora  Jawa Tengah  Indonesia, location_Negara  Bali  Indonesia, locale_ru_RU, location_Vancouver  British Columbia, location_Weggis  Switzerland, locale_sv_SE, location_Orange  California, location_Melbourne  07, location_Los Altos  CA, location_South El Monte  CA, location_Urbana  Illinois, location_California City  California, location_Lawrence  Massachusetts, locale_pt_BR, location_Lasem  Jawa Tengah  Indonesia, location_West Covina  California, location_Paramount  California, location_Monterey Park  CA, location_Aliso Viejo  CA, location_Tulungagung, location_Irvine  CA, location_Mumbai  16, location_Batam  40, locale_bs_BA, location_Monrovia  CA, location_Aylmer  Quebec, location_Pematangraya  Sumatera Utara  Indonesia, location_Newport Beach  California, location_Moreno Valley  California, location_Houston  Texas, location_Bavet  Svay Rieng  Cambodia, location_Fagersta  25, location_Djambi  Jambi  Indonesia, location_San Rafael  CA, location_Cristo Rey  Distrito Nacional, location_Makati  D9, location_Cluny  Grand Port  Mauritius, location_Acireale, location_Brampton  Ontario, location_La Habra Heights  California, location_Binji, location_Tucson  AZ, location_Kendal  Indonesia, locale_es_ES, location_Marysville  New Brunswick, location_Tehran  Iran, location_Samarinda, location_Dumai, location_Kosciusko  Mississippi, location_Alexandria  VA, location_Lawndale  California, location_Sanchez Mira, location_Altadena  CA, location_Belawan, location_Lindsay  ON, location_Canberra  Australian Capital Territory, location_Amsterdam  Netherlands, location_Waterloo  Ontario, location_Paterson  New Jersey, location_Rosemead  CA, location_Torrance  California, location_Aeknabara  Sumatera Utara  Indonesia, location_Zamboanga City, locale_ms_MY, location_Santa Ana  CA, location_Yorba Linda  CA, location_Sleman  10, location_Addis Ababa  Ethiopia, location_Piton  Riviere Du Rempart  Mauritius, location_Tuol  03, locale_id_ID, location_Sausalito  California, location_Pnom Penh  Phnum Penh  Cambodia, location_Bahawalpur, location_Downey  CA, location_Sunggal  Sumatera Utara  Indonesia, location_Oceanside  California, location_Rancho Santa Margarita  CA, location_Sebastopol  Flacq  Mauritius, location_Bandar Lampung  Indonesia, location_Bronx  New York, location_Kuningan Satu  Jawa Barat  Indonesia, locale_km_KH, location_Manoguayabo  Distrito Nacional  Dominican Republic, location_Austin  TX, joinedAt_year, location_Grand Gaube  Mauritius, location_Mumbai  Maharashtra  India, location_Banlung, locale_en_GB, joinedAt_day, location_Daly City  California, location_Wolcott  Connecticut, location_Guria, location_Emo  Ontario, location_Tanta  05, location_Manchester  United Kingdom, location_Sampaloc  53, locale_zh_CN, location_Santiago  Chile, locale_it_IT, location_Rajapolah  Jawa Barat  Indonesia, location_Kitchener  Ontario, location_Bridgeport  Connecticut, location_Singapore  Singapore, location_Malipampang  Bulacan  Philippines, location_Zagreb  Croatia, locale_ka_GE, locale_fr_FR, location_Miami Beach  Florida, location_Bellflower  CA, location_Parapat  Sumatera Utara  Indonesia, location_Timika  Papua  Indonesia, location_Alexandria  Egypt, location_South Orange  NJ, location_Thornhill  Ontario, maybe_count, locale_ar_AR, location_Quezon City  Philippines, location_Los Gatos  California, location_Brampton  ON, location_Palangkaraya, location_Agoura Hills  California, location_Mao  Valverde, location_Nieuw Singkil  Aceh  Indonesia, location_Dubai  United Arab Emirates, location_Langsa, location_Djelfa  22, locale_de_DE, location_San Mateo  CA, location_San Pedro  California, location_South Gate  California, locale_es_LA, location_San Carlos  05, location_Windsor  Ontario, location_Banda Aceh  01, location_Grande Prairie  AB, location_Sylmar  CA, location_Allentown  PA, location_Minneapolis  Minnesota, location_Cypress  California, location_Quezon City  F2, location_Sragen  Jawa Tengah  Indonesia, locale_ja_JP, location_Sidareja  Jawa Tengah  Indonesia, joinedAt_month, location_Mankato  Minnesota, location_Miches, location_Edmonton  Alberta, location_Santa Monica  California, location_Port Louis Town  Port Louis  Mauritius, location_Laventure  13, location_Banda Atjeh  Aceh  Indonesia, location_Avondale  Arizona, location_Ungaran  Jawa Tengah  Indonesia, yes_count, location_Garut 1  Jawa Barat  Indonesia, location_Cagayan de Oro  Philippines, location_Banda  Aceh  Indonesia, location_Grande Pointe Aux Piments  Pamplemousses  Mauritius, location_Surakarta, location_Kuwait City, location_Padang  Indonesia, location_Islamabad  08, location_Puncak Alam, locale_fa_IR, location_Surallah  South Cotabato, locale_en_US, location_Congomah  16

In [44]:
# Convert birthyear to numeric
test_merged['birthyear'] = pd.to_numeric(test_merged['birthyear'], errors='coerce')

# One-hot encode categorical columns (if not already done)
test_merged = pd.get_dummies(test_merged, columns=['locale', 'gender', 'location'], drop_first=True)

# Ensure the columns match between X_train and X_test
X_test = test_merged.reindex(columns=X_train.columns, fill_value=0)

# Now you can make predictions
y_test_pred = model.predict_proba(X_test)[:, 1]

# Output predictions (ranking of events by likelihood of user interest)
test_merged['predicted_interest'] = y_test_pred
recommendations = test_merged[['user', 'event', 'predicted_interest']].sort_values(by=['user', 'predicted_interest'], ascending=[True, False])

# Display top recommendations
print(recommendations.head(10))



      user       event  predicted_interest
0  1776192  2877501688            0.220296
1  1776192  3025444328            0.220296
2  1776192  4078218285            0.220296
3  1776192  1024025121            0.220296
4  1776192  2972428928            0.220296
5  1776192  2514143386            0.220296
6  1776192  1823369186            0.220296
7  5161061  2027962693            0.227554
8  5161061  1652007005            0.227554
9  5161061  2169802745            0.227554


In [45]:
# Cell for making predictions on the test set
y_test_pred = model.predict_proba(X_test)[:, 1]

# Output predictions (ranking of events by likelihood of user interest)
test_merged['predicted_interest'] = y_test_pred
recommendations = test_merged[['user', 'event', 'predicted_interest']].sort_values(by=['user', 'predicted_interest'], ascending=[True, False])

# Show the top recommendations
print(recommendations.head(10))



      user       event  predicted_interest
0  1776192  2877501688            0.220296
1  1776192  3025444328            0.220296
2  1776192  4078218285            0.220296
3  1776192  1024025121            0.220296
4  1776192  2972428928            0.220296
5  1776192  2514143386            0.220296
6  1776192  1823369186            0.220296
7  5161061  2027962693            0.227554
8  5161061  1652007005            0.227554
9  5161061  2169802745            0.227554


In [46]:
# Cell 7: Train the final model with the best hyperparameters
best_params = study.best_trial.params
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X, y)


In [48]:
# Convert 'birthyear' to numeric
test_merged['birthyear'] = pd.to_numeric(test_merged['birthyear'], errors='coerce')

# Convert 'timestamp' and 'joinedAt' into year, month, and day features
test_merged['timestamp_year'] = pd.to_datetime(test_merged['timestamp'], errors='coerce').dt.year
test_merged['timestamp_month'] = pd.to_datetime(test_merged['timestamp'], errors='coerce').dt.month
test_merged['timestamp_day'] = pd.to_datetime(test_merged['timestamp'], errors='coerce').dt.day

test_merged['joinedAt_year'] = pd.to_datetime(test_merged['joinedAt'], errors='coerce').dt.year
test_merged['joinedAt_month'] = pd.to_datetime(test_merged['joinedAt'], errors='coerce').dt.month
test_merged['joinedAt_day'] = pd.to_datetime(test_merged['joinedAt'], errors='coerce').dt.day

# Drop the original 'timestamp' and 'joinedAt' columns since we've extracted the necessary features
test_merged = test_merged.drop(columns=['timestamp', 'joinedAt'])

# One-hot encode categorical columns like 'locale', 'gender', 'location'
test_merged = pd.get_dummies(test_merged, columns=['locale', 'gender', 'location'], drop_first=True)

# Reindex to ensure X_test matches the columns of X_train
X_test = test_merged.reindex(columns=X_train.columns, fill_value=0)

# Now you can make predictions
y_test_pred = model.predict_proba(X_test)[:, 1]

# Output predictions (ranking of events by likelihood of user interest)
test_merged['predicted_interest'] = y_test_pred
recommendations = test_merged[['user', 'event', 'predicted_interest']].sort_values(by=['user', 'predicted_interest'], ascending=[True, False])

# Display top recommendations
print(recommendations.head(10))




      user       event  predicted_interest
0  1776192  2877501688            0.210123
1  1776192  3025444328            0.210123
2  1776192  4078218285            0.210123
3  1776192  1024025121            0.210123
4  1776192  2972428928            0.210123
5  1776192  2514143386            0.210123
6  1776192  1823369186            0.210123
7  5161061  2027962693            0.359215
8  5161061  1652007005            0.359215
9  5161061  2169802745            0.359215


In [49]:
# Cell 9: Compute Mean Average Precision at 200 for validation set
def mean_average_precision(y_true, y_pred):
    return average_precision_score(y_true, y_pred)

y_pred_val = final_model.predict_proba(X_val)[:, 1]
map_score = mean_average_precision(y_val, y_pred_val)
print(f"Mean Average Precision at 200: {map_score}")


Mean Average Precision at 200: 0.6014549699206235
