In [1]:
# importing the required libraries

import pandas as pd
import numpy as np
import geopy
from geopy.geocoders import Nominatim
import geopy.distance
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier

## Loading the Data

In [4]:
train          = pd.read_csv("../../train.csv")
test           = pd.read_csv("../../test.csv")
bikers         = pd.read_csv("../../bikers.csv")
bikers_network = pd.read_csv("../../bikers_network.csv")
tours          = pd.read_csv("../../tours.csv")
tour_convoy    = pd.read_csv("../../tour_convoy.csv")

*like and dislike provide the same information,ie (dilike=1-like) hence the feature dislike is dropped and the feature like
is used as the target feature*

In [5]:
output = train['like']
train.drop(columns = ['like','dislike'], inplace = True)

*Merging the datasets (train and bikers),(test and bikers) in order to extract useful features*

In [8]:
# train data
df_comb = pd.merge(train, bikers, how = 'left', on = 'biker_id')
df_comb = pd.merge(df_comb, tours, how = 'left', on = 'tour_id')

# test Data
df1_comb = pd.merge(test, bikers, how = 'left', on = 'biker_id')
df1_comb = pd.merge(df1_comb, tours, how = 'left', on = 'tour_id')



# Data Preprocessing and Feature Engineering

Let us analyse the features one by one 

### Analyzing   bornIn

*we can see that there are some None values , which we are going to convert into an empty string for future use*

In [36]:
def born(x):
    if str(x) == 'None':
        return np.nan
    else:
        return x
    
df_comb['bornIn'] = df_comb['bornIn'].apply(lambda x: born(x))
df1_comb['bornIn'] = df1_comb['bornIn'].apply(lambda x: born(x))

###  Biker Latitude and Longitude

*The latitudes and longitudes of tours are given , the latitudes and longitudes of the biker country are computed so that the distance between the biker country and tour country can be calculated*

#### Train Data

In [37]:
# extracting the unique area values
area1 = pd.unique(df_comb['area'])

array(['Binjai', 'Beau Vallon  Grand Port  Mauritius',
       'Phnom Pen  Phnum Penh  Cambodia', 'Los Angeles  California',
       'Tangerang', 'Medan  Indonesia', 'Yogyakarta', 'Bekasi', nan,
       'Heatherton  07', 'Fontana  CA',
       'Lahat  Sumatera Selatan  Indonesia', 'Hollywood  California',
       'Berkeley  California', 'El Monte  CA',
       'Terre Rouge  Pamplemousses  Mauritius', 'Jogjakarta  Indonesia',
       'Cilegon', 'Phnom Penh  11', 'Djokja  Yogyakarta  Indonesia',
       'Phnom Penh', 'Rantauprapat  Sumatera Utara  Indonesia',
       'Flacq  Flacq  Mauritius', 'Ottawa  Ontario', 'Jakarta  04',
       'Los Angeles  CA', 'Boston  Massachusetts', 'Turin  Italy',
       'Toronto  ON', 'Riverside  CA', 'Hargeisa  Somalia',
       'Gazipur  Dhaka  Bangladesh', 'Cirebon', 'Norwalk  California',
       'Pematangsiantar', '  ', 'Triolet  Mauritius',
       'undefined  undefined', 'Kyiv  Ukraine', 'Siem Reap',
       'Pamplemousses  Pamplemousses  Mauritius', 'Tabanan  02'

*As geopy gives time out errors , we have hardcoded the latitudes and longitudes that we initially got from geopy.*

In [39]:
dict1 = {'Binjai': (3.6063964, 98.4899865),
 'Beau Vallon  Grand Port  Mauritius': (-20.4249, 57.7027),
 'Phnom Pen  Phnum Penh  Cambodia': (11.5757985, 104.9179179),
 'Los Angeles  California': (34.0536909, -118.24276599999999),
 'Tangerang': (-6.176031099999999, 106.63844679999998),
 'Medan  Indonesia': (3.5896654000000003, 98.6738261),
 'Yogyakarta': (-7.977838399999999, 110.36722565020223),
 'Bekasi': (-6.2349858, 106.99454440000001),
 'Heatherton  07': (33.73131718367347, -117.78149081632652),
 'Fontana  CA': (34.0922335, -117.435048),
 'Lahat  Sumatera Selatan  Indonesia': (-3.87934575, 103.41408633497845),
 'Hollywood  California': (34.0980031, -118.329523),
 'Berkeley  California': (37.8708393, -122.27286389999999),
 'El Monte  CA': (34.0751571, -118.036849),
 'Terre Rouge  Pamplemousses  Mauritius': (-20.126423000000003,
  57.53578100000001),
 'Jogjakarta  Indonesia': (-7.801194500000001, 110.36491699999999),
 'Cilegon': (-6.017389, 106.05376880000001),
 'Phnom Penh  11': (11.568271000000001, 104.9224426),
 'Djokja  Yogyakarta  Indonesia': (-7.7956, 110.3695),
 'Phnom Penh': (11.568271000000001, 104.9224426),
 'Rantauprapat  Sumatera Utara  Indonesia': (2.1000330000000003,
  99.8271935838787),
 'Flacq  Flacq  Mauritius': (-20.1948891, 57.7229652),
 'Ottawa  Ontario': (45.421106, -75.690308),
 'Jakarta  04': (-6.1753942, 106.827183),
 'Los Angeles  CA': (34.0536909, -118.24276599999999),
 'Boston  Massachusetts': (42.3602534, -71.0582912),
 'Turin  Italy': (45.06775510000001, 7.682489200000001),
 'Toronto  ON': (43.6534817, -79.38393470000001),
 'Riverside  CA': (33.953354600000004, -117.39616229999999),
 'Hargeisa  Somalia': (9.561733, 44.061803000000005),
 'Gazipur  Dhaka  Bangladesh': (23.998079699999998, 90.4229848),
 'Cirebon': (-6.713704400000001, 108.5608483),
 'Norwalk  California': (33.909280200000005, -118.0849169),
 'Pematangsiantar': (3.5741592000000004, 98.6884926),
 'Triolet  Mauritius': (-20.057474300000003, 57.5521877),
 'undefined  undefined': (36.01952335, 14.238564982829594),
 'Kyiv  Ukraine': (50.4500336, 30.524136100000003),
 'Siem Reap': (13.3617562, 103.8590321),
 'Pamplemousses  Pamplemousses  Mauritius': (-20.1107201, 57.5764944),
 'Tabanan  02': (-8.5392306, 115.1265683),
 'Santo Domingo  Dominican Republic': (18.4801972, -69.942111),
 'Bangko  Jambi  Indonesia': (-2.0821883000000003, 102.2633895),
 'New Kingston  Saint Andrew  Jamaica': (18.0052171, -76.7862589),
 'Wonogiri  Jawa Tengah  Indonesia': (-7.9620507, 110.95757325155418),
 'Barcelona  Spain': (41.3828939, 2.1774322),
 'Pekalongan  Indonesia': (-7.042220599999999, 109.61973085835145),
 'Pematangsieantar  Sumatera Utara  Indonesia': (3.5741592000000004,
  98.6884926),
 'Purworejo  Jawa Tengah  Indonesia': (-7.7073021, 109.96651169154025),
 'Huntington Beach  California': (33.6783336, -118.0000166),
 'Medan  26': (31.329779700000003, 31.7165375),
 'Purwokerto  Jawa Tengah  Indonesia': (-7.4279011, 109.24085009999999),
 'Tbilisi  Georgia': (41.6934591, 44.8014495),
 'Union Park  Grand Port  Mauritius': (-20.4319438, 57.661488899999995),
 'Mexico City  Mexico': (19.4326296, -99.1331785),
 'Long Beach  California': (33.7690164, -118.191604),
 'Berastagi': (3.1928704, 98.5091805),
 'Pekanbaru': (0.5262455, 101.45157270000001),
 'San Jacinto  California': (33.7839664, -116.95890859999999),
 'Trabek  05': (51.5627778, 21.0722222),
 'Lukachukai  AZ': (36.407754499999996, -109.22696452080706),
 'Sihanoukville': (10.5045515, 103.25784114285527),
 'Jakarta  Indonesia': (-6.1753942, 106.827183),
 'Surabaya  08': (-7.2459717, 112.73782659999998),
 'Kompong Cham  Kampong Cham  Cambodia': (11.9921855, 105.4628346),
 'Hangzhou  China': (30.248963399999997, 120.20523419999999),
 'Murcia  Murcia': (37.9923795, -1.1305431000000001),
 'Sierra Madre  California': (34.1616729, -118.0528456),
 'Beverly Hills  California': (34.0696501, -118.3963062),
 'Sibolga': (1.7369569999999999, 98.78461),
 'London  United Kingdom': (51.5073219, -0.1276474),
 'Waianae  Hawaii': (21.4454875, -158.18782389776405),
 'Tucson  Arizona': (32.2228765, -110.9748477),
 'Toronto  Ontario': (43.6534817, -79.38393470000001),
 'Poughkeepsie  NY': (41.706553899999996, -73.9283672),
 'Camp De Masque  Flacq  Mauritius': (-20.2351277, 57.676299490855435),
 'DKI Jakarta': (-6.1753942, 106.827183),
 'Port Harcourt  50': (4.7676576, 7.0188527),
 'Palembang': (-2.9888297, 104.756857),
 'Hanoi  44': (21.0189387, 105.84864750000001),
 'Manhattan  New York': (40.7587979, -73.96234270000001),
 'Whittier  California': (33.9708782, -118.0308396),
 'New York  New York': (36.1021436, -115.17449463280128),
 'Mesa  Arizona': (33.4151117, -111.83147920000002),
 'San Jose  California': (37.3361905, -121.890583),
 'Long Beach  CA': (33.7690164, -118.191604),
 'Compton  California': (33.894927, -118.226624),
 'San Francisco  California': (37.779026200000004, -122.4199061),
 'Bandung  Indonesia': (-6.9344694, 107.60495390000001),
 'Yogyakarta  10': (-7.80868595, 110.11477359526589),
 'Makassar': (-5.1342962, 119.41242820000001),
 'Sleman': (-7.6894175, 110.38129042693123),
 'Anaheim  California': (33.8347516, -117.911732),
 'Paramount  CA': (33.898917, -118.17100500000001),
 'Millbrae  California': (37.598958, -122.4009419),
 'Kerman  Iran': (29.571858000000002, 57.301047),
 'Salatiga': (-7.330264200000001, 110.49953529999999),
 'Camp Thorel  Moka  Mauritius': (-20.2165293, 57.6229284),
 'Melrose  Moka  Mauritius': (-20.221007999999998, 57.503026299999995),
 'Salinas  CA': (36.6744117, -121.6550372),
 'Sudbury  Ontario': (46.487489000000004, -80.9921545),
 'Jokjakarta  Yogyakarta  Indonesia': (-7.7956, 110.3695),
 'Wellesley  Massachusetts': (42.2964859, -71.2925571),
 'Boston  New York': (42.6289495, -78.73752890000002),
 'Siantar': (2.9593587, 99.06310429999999),
 'Epsom': (51.3326098, -0.2678212),
 'Bel Air  Flacq  Mauritius': (-20.2559669, 57.75707270000001),
 'Battamban  Batdambang  Cambodia': (13.0957, 103.2022),
 'Phoenix  Arizona': (33.4484, -112.07414170000001),
 'Medina  Saudi Arabia': (24.471153, 39.611121600000004),
 'Desert Hot Springs  CA': (33.961124, -116.5016784),
 'Quetta  02': (30.195767699999998, 67.0172447),
 'Padangsidimpuan  Sumatera Utara  Indonesia': (1.3810981000000002,
  99.27238531444893),
 'Tasikmalaya': (-7.3262484, 108.22011540000001),
 'Waterbury  Connecticut': (41.5538091, -73.0438362),
 'Bantul': (-7.89825435, 110.3855533589862),
 'Venice  California': (33.988334949999995, -118.46152716615019),
 'Semarang  07': (-6.9903987999999995, 110.4229104),
 "Trou D'Eau Douce  Flacq  Mauritius": (-20.239751300000002, 57.784971),
 'Shelton  Connecticut': (41.31561, -73.14016262885858),
 'Singkil  Aceh  Indonesia': (2.2827782, 97.7972072),
 'Ibiza  Islas Baleares  Spain': (38.9743901, 1.4197463178515335),
 'Montebello  CA': (34.0159398, -118.111975),
 'Thimphu  Bhutan': (27.4713546, 89.63367290000001),
 'Lowell  Massachusetts': (42.633424700000006, -71.3161718),
 'Ontario  California': (34.065846, -117.6484304),
 'Port Louis  Mauritius': (-20.1637281, 57.5045331),
 'Bantoel  Yogyakarta  Indonesia': (-7.919, 110.3785),
 'Takengon  Aceh  Indonesia': (4.6230329, 96.84885890000001),
 'Bandar Lampung': (-5.4460713, 105.2643742),
 'Djokjakarta  Yogyakarta  Indonesia': (-7.7956, 110.3695),
 'Centre de Flacq  Mauritius': (-20.1948891, 57.7229652),
 'Cebu City': (10.3095549, 123.89311070000001),
 'Salalah  Oman': (17.014903600000004, 54.095697200000004),
 'Jombang  Jawa Timur  Indonesia': (-7.538452799999999, 112.2379884),
 'Ville De Phnom Penh  Phnum Penh  Cambodia': (11.5564, 104.9282),
 'High Point  North Carolina': (35.9556924, -80.0053176),
 'Gilbert  Arizona': (33.3527632, -111.78903729999999),
 'Poipet': (13.6593962, 102.5708301),
 'Fond du Sac  Mauritius': (-20.046151300000002, 57.58617660000001),
 'Malang': (-7.977120599999999, 112.63402909999999),
 'Middletown  Orange County  New York': (41.4459, -74.4229),
 'Highland Falls  New York': (41.369260499999996, -73.9662504),
 'Santiago De Los Caballeros  Santiago  Dominican Republic': (19.4503932,
  -70.6909047),
 'Siborongborong  Sumatera Utara  Indonesia': (2.2115, 98.9719),
 'Paris  France': (48.856696899999996, 2.3514616000000004),
 'Stanford  California': (37.4265407, -122.17029258409187),
 'Seoul  Korea': (37.5666791, 126.97829140000002),
 'Kudus': (31.79592425, 35.21198075969497),
 'Battambang': (12.9256791, 103.23171364274523),
 'Glendale  Arizona': (33.5386858, -112.18599409999999),
 'Mayaguez  00': (18.201116100000004, -67.1391124),
 'Cilegong  Jawa Barat  Indonesia': (-6.5371, 107.4233),
 'Miskolc  Hungary': (48.1030643, 20.790042899999996),
 'Hoboken  New Jersey': (40.7433066, -74.0323752),
 'Wonosobo  Jawa Tengah  Indonesia': (-7.39979995, 109.92096446847731),
 'New Haven  Connecticut': (41.29843435, -72.93102342707913),
 'Temanggung  Jawa Tengah  Indonesia': (-7.316237700000001, 110.1754504),
 'Scottsdale  Arizona': (33.4942189, -111.92601840000002),
 'Tebing Tinggi': (-1.0205587, 103.0851459),
 'Jeddah  14': (29.9919941, 32.5200945),
 'Dortmund  Germany': (51.5142273, 7.4652789),
 'North Richmond  New South Wales  Australia': (-33.57501129999999,
  150.7056842),
 'Haveli  Punjab  Pakistan': (33.4690855, 72.1187625),
 'Montclair  NJ': (40.8164458, -74.2210643),
 'Corona  California': (33.8752945, -117.56644490000001),
 'Hamilton  Ontario': (43.2560802, -79.8728583),
 'Anaheim  CA': (33.8347516, -117.911732),
 'Santa Clarita  California': (34.3916641, -118.54258600000001),
 'Peoria  Arizona': (33.580611499999996, -112.23729399999999),
 'Manila  Philippines': (14.590733199999999, 120.98096740000001),
 'Kotagede  Yogyakarta  Indonesia': (-7.81831055, 110.39794090974735),
 'Tangerangscheweg  Jawa Barat  Indonesia': (-6.178306, 106.63188899999999),
 'Bajos De Haina  33': (18.4138992, -70.0322971),
 'Surabaya  Indonesia': (-7.2459717, 112.73782659999998),
 'Padang Sidempuan': (1.3810981000000002, 99.27238531444893),
 'Hacienda Heights  California': (33.993067700000005, -117.968675),
 'Ottawa  ON': (45.421106, -75.690308),
 'Cachoeirinha': (-29.9493353, -51.0898927),
 'Quito  Ecuador': (-0.2201641, -78.5123274),
 'Tarutung  Sumatera Utara  Indonesia': (2.0230378, 98.9649283),
 'Cassis  Port Louis  Mauritius': (-20.163524600000002, 57.484449100000006),
 'Mao Adentro  Valverde  Dominican Republic': (19.55, -71.08333),
 'Nagpur': (21.1498134, 79.0820556),
 'Mataram': (-8.5837726, 116.10685),
 'London  Ohio': (39.886449299999995, -83.44825300000001),
 'Bihar Sharif': (25.193859, 85.52086170000001),
 'Sydney  02': (-33.8548157, 151.2164539),
 'Pune  16': (18.4924318, 73.8245738),
 'Coyote  CA': (37.21649229999999, -121.73941740000001),
 'Lahore  04': (31.565682199999998, 74.3141829),
 'Batac  Ilocos Norte': (18.056650899999998, 120.5639598),
 'Klaten  Jawa Tengah  Indonesia': (-7.674920399999999, 110.62727870972591),
 'Ghhanike  Punjab  Pakistan': (32.0547, 74.9296),
 'Santo Domingo  05': (42.1330807, -0.404316),
 'Torrance  CA': (33.835849200000006, -118.34062879999999),
 'Rose Belle  Grand Port  Mauritius': (-20.403103100000003, 57.60241429999999),
 'New London  Connecticut': (41.331255999999996, -72.09643715791684),
 'Montalban  Rizal': (14.7832711, 121.1803945),
 'Covina  CA': (34.08214965, -117.87212381516031),
 'Philadelphia  Pennsylvania': (39.9527237, -75.16352619999999),
 'Guaricano  Distrito Nacional  Dominican Republic': (18.5398, -69.9357),
 'Port Louis  18': (-20.1637281, 57.5045331),
 'Santos  Sao Paulo  Brazil': (-23.960832999999997, -46.333889),
 'Sheffield': (53.38066260000001, -1.4702278),
 'Kampot  Cambodia': (10.6314754, 104.132637),
 'Kwidzynia  Elblag  Poland': (53.7264, 18.9323),
 'San Diego  California': (32.7174202, -117.16277279999998),
 'Garden Grove  California': (33.7746292, -117.9463717),
 'Yonkers  New York': (40.9312099, -73.8987469),
 'Oberhausen  Germany': (51.469613700000004, 6.851443499999999),
 'New Delhi  India': (28.613895399999997, 77.2090057),
 'Quatre Bornes  17': (-20.264301500000002, 57.4800061),
 'Pomona  California': (34.05538129999999, -117.75174960000001),
 'Curup  Bengkulu  Indonesia': (-3.4676178, 102.5301142),
 'Ciamis  Jawa Barat  Indonesia': (-7.326661099999999, 108.3530952),
 'Cikampek  Jawa Barat  Indonesia': (-6.4382950999999995, 107.4670372),
 'Bangalore  India': (12.979119800000001, 77.5912997),
 'Moka  Mauritius': (-20.25229235, 57.58813128201174),
 'Plaine Des Papayes  Pamplemousses  Mauritius': (-20.0633122,
  57.58092070000001),
 'Sidoarjo  Jawa Timur  Indonesia': (-7.4559622, 112.66022171549456),
 'Soerabaya': (52.0907425, 5.099811),
 'Tanjungbalai  Sumatera Utara  Indonesia': (2.9707572, 99.799837),
 'Hemet  California': (33.7475203, -116.97196840000001),
 'Branford  Connecticut': (41.2795414, -72.81509890000001),
 'Multan': (30.1979793, 71.4724978),
 'Islamabad  Pakistan': (33.6938118, 73.0651511),
 'Mulia  35': (5.565613, 95.32384976226393),
 'Cairo  11': (14.3818891, 120.93315079999999),
 'San Pedro  CA': (33.7358518, -118.2922934),
 'Udaipur  Rajasthan': (24.578720999999998, 73.6862571),
 'Stoney Creek  ON': (43.2167722, -79.75675079999999),
 'Semarang  Indonesia': (-6.9903987999999995, 110.4229104),
 'Shinjuku-ku  Tokyo  Japan': (35.69376320000001, 139.7036319),
 'Aberdeen': (57.1482429, -2.0928095),
 'Cruce De Guayacanes  Valverde  Dominican Republic': (19.646160199999997,
  -71.0613473),
 'Barboursville  WV': (38.409529799999994, -82.29459229999999),
 'Mississauga  Ontario': (43.589623100000004, -79.64438790000001),
 'Indrapura  Sumatera Utara  Indonesia': (3.2845415, 99.3694726),
 'Cottage  Riviere Du Rempart  Mauritius': (-20.0639353, 57.635318999999996),
 'Gorontalo': (0.7186174, 122.45559270000001),
 'Moscow  Russia': (55.7504461, 37.6174943),
 'South Kenosha  Wisconsin': (42.584677299999996, -87.82122629999999),
 'Surakarta  Indonesia': (-7.5692462, 110.828448),
 'Montasik  Aceh  Indonesia': (5.4982791, 95.47842327933785),
 'Eugene  Oregon': (44.0505054, -123.09505060000001),
 'Leamington  Ontario': (42.0531166, -82.5996998),
 'Miami  Florida': (25.7741728, -80.19362),
 'Tempe  Arizona': (33.4255056, -111.9400125),
 'Pacoima  CA': (34.262502500000004, -118.427027),
 'Derby  Connecticut': (41.32695200000001, -73.08890597727381),
 'Delhi  07': (28.559238699999998, 77.08275309999999),
 'Kuala Lumpur  Malaysia': (3.1516964, 101.6942371),
 'Banda Aceh  Indonesia': (5.5528455, 95.31929079999999),
 'Curepipe  Mauritius': (-20.3150516, 57.521149699999995),
 'Tonekabon': (36.8154324, 50.8787003),
 'Abuja  31': (9.064330499999999, 7.4892974),
 'American Canyon  California': (38.223457, -122.22704307123536),
 'Kalasan  Yogyakarta  Indonesia': (-7.74710005, 110.46755438020578),
 'Laguboti  Sumatera Utara  Indonesia': (2.9468294999999998, 99.0617426),
 'Kebumen  Jawa Tengah  Indonesia': (-7.6686894, 109.65194650000001),
 'Phoenix  AZ': (33.448436699999995, -112.07414170000001),
 'Pailles Village  Moka  Mauritius': (-20.1898987, 57.48407170000001),
 'Scottsdale  AZ': (33.4942189, -111.92601840000002),
 'Quatre Bornes  Mauritius': (-20.264301500000002, 57.4800061),
 'Lalmatie  Flacq  Mauritius': (-20.1933, 57.6694),
 'Quetta  Pakistan': (30.195767699999998, 67.0172447),
 'Dabadie  Saint George  Trinidad And Tobago': (10.6167, -61.3167),
 'Vaughan  Ontario': (43.7941544, -79.52680229999999),
 'Cilacap  Jawa Tengah  Indonesia': (-7.46202885, 108.80476219169329),
 'Panorama City  CA': (34.2242902, -118.44537450000001),
 'Gardena  CA': (33.8963593, -118.3053037),
 'Cottage Grove  Minnesota': (44.8277446, -92.9438218),
 'Villa Mella  Dominican Republic': (18.531095999999998, -69.9064378),
 'Milton  Ontario': (43.513671, -79.882817),
 'Lake Forest  California': (33.6469261, -117.6859213),
 'Queens  New York': (40.74982429999999, -73.7976337),
 'Sigli  Aceh  Indonesia': (5.381312599999999, 95.957932),
 'Probolinggo': (-7.86250165, 113.30513611585009),
 'Saint Hubert  Grand Port  Mauritius': (-20.3636792, 57.641849300000004),
 'London  ON': (42.9836747, -81.2496068),
 'Bandung Dua  Jawa Barat  Indonesia': (-6.425, 107.4033),
 'San Gabriel  California': (34.09907329999999, -118.10856820000001),
 'Brooklyn  NY': (40.6501038, -73.9495823),
 'San Francisco  CA': (37.779026200000004, -122.4199061),
 'Lagos  Nigeria': (6.4550575, 3.3941795),
 'Batam  Ouham  Central African Republic': (1.1301, 104.0529),
 'Khulna': (22.9372087, 89.2852741),
 'Pasadena  California': (34.14764520000001, -118.14447790000001),
 'Mount Vernon  NY': (40.9125815, -73.8370786),
 'Casablanca  45': (6.0454369, -74.55203709999999),
 'Krapyak  Yogyakarta  Indonesia': (-7.8284775500000015, 110.36224218899294),
 'Batavia  Jawa Barat  Indonesia': (-6.3560403, 106.8293001),
 'Surabaya': (-7.2459717, 112.73782659999998),
 'Surakarta  07': (-7.5692462, 110.828448),
 'Kuta Raja  Aceh  Indonesia': (5.5682386, 95.31432357956051),
 'Duarte  California': (34.1394513, -117.97728729999999),
 'San Clemente  California': (33.4270275, -117.6124179),
 'Palmdale  California': (34.5793131, -118.1171108),
 'Doha  01': (25.2856329, 51.5264162),
 'Rose Hill  Mauritius': (-20.242110800000003, 57.475913),
 'Kalibo Town  Aklan  Philippines': (11.6892, 122.3674),
 'Staten Island  New York': (40.5834557, -74.14960479999999),
 'Santa Clara  California': (37.2333253, -121.68463490000002),
 'Gamping  Yogyakarta  Indonesia': (-7.784314249999999, 110.3322103289057),
 'Fontana  California': (34.0922335, -117.435048),
 'Caracas  Venezuela': (10.506098, -66.9146017),
 'Dallas  Texas': (-23.647594199999997, -46.5101346),
 'Kota  Riau  Indonesia': (-0.3954509, 104.516511),
 'Hayward  CA': (37.6688205, -122.08079640000001),
 'La Flora  Savanne  Mauritius': (-20.4108583, 57.558992599999996),
 'Newark  New Jersey': (40.735657, -74.1723667),
 'Woodland Hills  California': (34.1684364, -118.6058382),
 'London  Ontario': (42.9836747, -81.2496068),
 'Cimanggis  Jawa Barat  Indonesia': (-6.5080002, 106.7797759),
 'Madrid  Spain': (40.416704700000004, -3.7035824999999996),
 'Seoul  11': (14.3878328, 120.93159550000001),
 'Spanish Town  10': (17.9956926, -76.95408309999999),
 'Isabelita  Distrito Nacional  Dominican Republic': (18.7009047, -70.1654584),
 'Corona  CA': (33.8752945, -117.56644490000001),
 'Santa Ana  California': (33.7494951, -117.8732213),
 'Konya  Turkey': (37.8719963, 32.484401500000004),
 'Pasadena  CA': (34.14764520000001, -118.14447790000001),
 'Chandigarh  India': (30.733442100000005, 76.7797143),
 'Pontianak  Indonesia': (-0.0226903, 109.34474879999999),
 'Belle Mare  Flacq  Mauritius': (-20.1978549, 57.7743491),
 'Barrie  Ontario': (44.389311299999996, -79.6901736),
 'Costa Mesa  CA': (33.6633386, -117.90331699999999),
 'Bronx  NY': (40.8466508, -73.87859370000001),
 'Buena Park  California': (33.870413, -117.9962165),
 'Irvine  California': (33.685696899999996, -117.82598190000002),
 'Montreal  Quebec': (45.4972159, -73.61036419999999),
 'Westminster  California': (33.7578725, -117.98590540000001),
 'Nouvelle France  Grand Port  Mauritius': (-20.374633600000003, 57.5681814),
 'Palo Alto  California': (37.44432929999999, -122.15984650000001),
 'Santa Fe Springs  California': (33.9480787, -118.06914990000001),
 'Dudhnai  Assam  India': (25.98202335, 90.8164062850866),
 'Bogor  Indonesia': (-6.596298599999999, 106.79724209999999),
 'Commerce  California': (34.002581, -118.156586),
 'Fredericton  New Brunswick': (45.94795935000001, -66.65336235897576),
 'San Lorenzo  Santa Fe': (-32.743425800000004, -60.74967420000001),
 'Bogor  30': (-6.596298599999999, 106.79724209999999),
 'Oakland  California': (37.804455700000005, -122.27135630000001),
 'Saint Domingue  Distrito Nacional  Dominican Republic': (18.4801972,
  -69.942111),
 'Pangkalanbrandan  Sumatera Utara  Indonesia': (4.0225925, 98.2904379),
 'Solo City': (-7.5692462, 110.828448),
 'Montebello  California': (34.0159398, -118.111975),
 'Karachi  Pakistan': (24.8667795, 67.0311286),
 'North Highlands  California': (38.671097499999995, -121.3879538180464),
 'Mill Valley  California': (37.906036799999995, -122.5449763),
 'Sydney  Australia': (-33.8548157, 151.2164539),
 'Lakewood  California': (33.8503463, -118.11719140000001),
 'Argy  Flacq  Mauritius': (-20.1917948, 57.735224),
 'Villa Consuelo': (18.4842583, -69.89988699999999),
 'Placentia  California': (33.8714814, -117.8617337),
 'Fort Lauderdale  Florida': (26.122308399999998, -80.14337859999999),
 'Guangzhou  China': (23.1301964, 113.25929450000001),
 'Rochester  New York': (43.157284999999995, -77.615214),
 "Bat'Umi  Ajaria  Georgia": (41.6168, 41.6367),
 'Bandung  30': (-6.9344694, 107.60495390000001),
 'Liverpool  H8': (5.3763241, -74.9170527),
 'Trou Aux Biches  Pamplemousses  Mauritius': (-20.0362961,
  57.544703299999995),
 'Petit Raffray  Riviere Du Rempart  Mauritius': (-20.019466100000002,
  57.6261351),
 'Burlington  Ontario': (43.3248924, -79.7966835),
 'Lynwood  California': (33.924831, -118.2024154),
 'Kabanjahe': (3.1009104, 98.48912179999999),
 'Soroako': (-2.5624691000000004, 121.3993361),
 'Rancho Palos Verdes  California': (33.7483311, -118.3707683),
 'Banyuwangi  Jawa Timur  Indonesia': (-8.2094973, 114.37372009999999),
 'Balige  Sumatera Utara  Indonesia': (2.3334951, 99.06676209999999),
 'Redondo Beach  California': (33.8455911, -118.38867659999998),
 'Depok Dua Timur  Jawa Barat  Indonesia': (-6.3853, 106.8473),
 'San Felipe de Puerto Plata': (19.7893977, -70.6940515),
 'Jayapura  Indonesia': (-2.5387539, 140.7037389),
 'Simi Valley  California': (34.2677404, -118.7538071),
 'Porsea  Sumatera Utara  Indonesia': (2.4447878, 99.1560688),
 'Roches Noires  Riviere Du Rempart  Mauritius': (-20.1088905, 57.7185331),
 'Zhonghe  Heilongjiang  China': (45.4461394, 127.46932269999999)}


In [43]:
df_comb['biker_lat'] = df_comb['area'].apply(lambda x : dict1[x][0] if x in dict1.keys() else np.nan)
df_comb['biker_long'] = df_comb['area'].apply(lambda x : dict1[x][1] if x in dict1.keys() else np.nan)

### Test

In [44]:
area2 = pd.unique(df1_comb['area'])

array(['Phnom Penh', 'Magelang', 'Toronto  Ontario',
       'Los Angeles  California', 'Medan  Indonesia', nan,
       'Lasem  Jawa Tengah  Indonesia', 'Yogyakarta',
       'Jombang  Jawa Timur  Indonesia',
       'Cengkareng  Jakarta Raya  Indonesia', 'Manila  Philippines',
       'Korea  Puerto Rico', 'Phnom Penh  11', 'Sibolga',
       'Ottawa  Ontario', 'Oshawa  Ontario', 'DKI Jakarta',
       'Addis Ababa  Ethiopia', 'Pematangsiantar',
       'Purworejo  Jawa Tengah  Indonesia', '  ', 'Kabanjahe',
       'Santo Domingo  05', 'Kupang  18', 'Surabaya  Indonesia',
       'Glendale  Arizona', 'Flacq  Flacq  Mauritius', 'Tangerang',
       'Malang', 'Tempe  Arizona', 'Medan  26', 'Bandar Lampung',
       'Los Angeles  CA', 'Compton  California',
       'San Francisco  California', 'Palermo  Italy', 'Bronx  NY',
       'Bekasi', 'Padangsidimpuan  Sumatera Utara  Indonesia',
       'Santo Domingo  Dominican Republic',
       'Jokjakarta  Yogyakarta  Indonesia',
       'Bumiayu  Jawa Teng

In [46]:
dict2 = {'Phnom Penh': (11.568271, 104.9224426),
 'Magelang': (-7.5136132, 110.21433030848786),
 'Toronto  Ontario': (43.6534817, -79.3839347),
 'Los Angeles  California': (34.0536909, -118.242766),
 'Medan  Indonesia': (3.5896654, 98.6738261),
 'Lasem  Jawa Tengah  Indonesia': (-6.6981337, 111.4476269),
 'Yogyakarta': (-7.9778383999999996, 110.36722565020224),
 'Jombang  Jawa Timur  Indonesia': (-7.5384528, 112.2379884),
 'Cengkareng  Jakarta Raya  Indonesia': (-6.1490933, 106.734781),
 'Manila  Philippines': (14.5907332, 120.9809674),
 'Phnom Penh  11': (11.568271, 104.9224426),
 'Sibolga': (1.736957, 98.78461),
 'Ottawa  Ontario': (45.421106, -75.690308),
 'Oshawa  Ontario': (43.8975558, -78.8635324),
 'DKI Jakarta': (-6.1753942, 106.827183),
 'Addis Ababa  Ethiopia': (9.0107934, 38.7612525),
 'Pematangsiantar': (3.5741592, 98.6884926),
 'Purworejo  Jawa Tengah  Indonesia': (-7.7073021, 109.96651169154023),
 'Kabanjahe': (3.1009104, 98.4891218),
 'Santo Domingo  05': (42.1330807, -0.404316),
 'Kupang  18': (-10.1632209, 123.6017755),
 'Surabaya  Indonesia': (-7.2459717, 112.7378266),
 'Glendale  Arizona': (33.5386858, -112.1859941),
 'Flacq  Flacq  Mauritius': (-20.1948891, 57.7229652),
 'Tangerang': (-6.1760311, 106.6384468),
 'Malang': (-7.9771206, 112.6340291),
 'Tempe  Arizona': (33.4255056, -111.9400125),
 'Medan  26': (31.3297797, 31.7165375),
 'Bandar Lampung': (-5.4460713, 105.2643742),
 'Los Angeles  CA': (34.0536909, -118.242766),
 'Compton  California': (33.894927, -118.226624),
 'San Francisco  California': (37.7790262, -122.4199061),
 'Palermo  Italy': (38.1112268, 13.3524434),
 'Bronx  NY': (40.8466508, -73.8785937),
 'Bekasi': (-6.2349858, 106.9945444),
 'Padangsidimpuan  Sumatera Utara  Indonesia': (1.3810981, 99.27238531444891),
 'Santo Domingo  Dominican Republic': (18.4801972, -69.942111),
 'Bumiayu  Jawa Tengah  Indonesia': (-7.2372672, 109.0099015),
 'Purwokerto  Jawa Tengah  Indonesia': (-7.4279011, 109.2408501),
 'Kampala  Uganda': (0.3177137, 32.5813539),
 'Bronx  New York': (40.8466508, -73.8785937),
 'Siem Reap': (13.3617562, 103.8590321),
 'Irvine  California': (33.6856969, -117.8259819),
 'Atlanta  Georgia': (33.7489924, -84.3902644),
 'Sebastopol  Flacq  Mauritius': (-20.2909197, 57.6891632),
 'Jogjakarta  Indonesia': (-7.8011945, 110.364917),
 'Toronto  ON': (43.6534817, -79.3839347),
 'Ungaran  Jawa Tengah  Indonesia': (-7.1327721, 110.4047238),
 'New York  New York': (36.1021436, -115.17449463280128),
 'Santiago De Los Caballeros  Santiago  Dominican Republic': (19.4503932,
  -70.6909047),
 'Phnom Pen  Phnum Penh  Cambodia': (11.5757985, 104.9179179),
 'Bel Air  Flacq  Mauritius': (-20.2559669, 57.7570727),
 'Alexandria  Egypt': (31.199004, 29.894378),
 'Manchester  United Kingdom': (53.4794892, -2.2451148),
 'Jakarta  04': (-6.1753942, 106.827183),
 'Battambang': (12.9256791, 103.23171364274523),
 'Jakarta  Indonesia': (-6.1753942, 106.827183),
 'Singkil  Aceh  Indonesia': (2.2827782, 97.7972072),
 'Djelfa  22': (34.342841, 3.217253079090331),
 'Tucson  Arizona': (32.2228765, -110.9748477),
 'South El Monte  CA': (34.0519548, -118.0467339),
 'Anaheim  California': (33.8347516, -117.911732),
 'South Gate  California': (33.9463456, -118.200981),
 'Blora  Jawa Tengah  Indonesia': (-6.9691528, 111.4130521),
 'Bellflower  CA': (33.8825705, -118.1167679),
 'Ciamis  Jawa Barat  Indonesia': (-7.3266611, 108.3530952),
 'Palembang  Indonesia': (-2.9888297, 104.756857),
 'Siantar': (2.9593587, 99.0631043),
 'New Haven  CT': (22.2667238, 114.19238192419849),
 'Vancouver  British Columbia': (49.2608724, -123.1139529),
 'Torrance  CA': (33.8358492, -118.3406288),
 'Altadena  CA': (34.1863161, -118.1352329),
 'Surabaya': (-7.2459717, 112.7378266),
 'Bajos De Haina  San Cristobal  Dominican Republic': (18.432172700000002,
  -70.0310060487165),
 'Jeddah  14': (29.9919941, 32.5200945),
 'Palangkaraya': (-2.2072919, 113.9164372),
 'Ottawa  ON': (45.421106, -75.690308),
 'Bandung  Indonesia': (-6.9344694, 107.6049539),
 'Orleans  Ontario': (45.40683715, -75.52745676171389),
 'Pekanbaru': (0.5262455, 101.4515727),
 'Triolet  Mauritius': (-20.0574743, 57.5521877),
 'Rajapolah  Jawa Barat  Indonesia': (-7.2417512, 108.1844423),
 'Amsterdam  Netherlands': (52.3727598, 4.8936041),
 'Boston  Massachusetts': (42.3602534, -71.0582912),
 "Trou D'Eau Douce  Flacq  Mauritius": (-20.2397513, 57.784971),
 'Mumbai  16': (19.0434279, 72.824326),
 'Mississauga  Ontario': (43.5896231, -79.6443879),
 'Miami  Florida': (25.7741728, -80.19362)}

88

In [47]:
dict2['Korea  Puerto Rico'] = (18.2208, 66.5901)
dict2['Jokjakarta  Yogyakarta  Indonesia'] = (-7.7956, 110.3695)
dict2['Djokja  Yogyakarta  Indonesia'] = (-7.7956, 110.3695)
dict2['Hargesa  Woqooyi Galbeed  Somalia'] = (9.5624, 44.0770)

In [49]:
df1_comb['biker_lat'] = df1_comb['area'].apply(lambda x : dict2[x][0] if x in dict2.keys() else np.nan)
df1_comb['biker_long'] = df1_comb['area'].apply(lambda x : dict2[x][1] if x in dict2.keys() else np.nan)

### Distance Calculation

In [34]:
def get_distance(biker_lat, biker_long, lat, long):
    if 'nan' not in [str(biker_lat), str(biker_long), str(lat), str(long)]:
        return geopy.distance.geodesic((biker_lat, biker_long),(lat, long)).km        
    else:
        return np.nan

#### Train

In [35]:
df_comb['distance'] = df_comb.apply(lambda x: get_distance(x['biker_lat'], x['biker_long'], x['latitude'], x['longitude']), axis = 1)

#### Test

In [36]:
df1_comb['distance'] = df1_comb.apply(lambda x: get_distance(x['biker_lat'], x['biker_long'], x['latitude'], x['longitude']), axis = 1)

### Time_Zone

*An additional feature tour zone is computed in order to estimate the zone difference between biker and tour locations*

In [44]:
def get_timezone(x):
    try:
        return int(x*4)
    except:
        return np.nan

In [46]:
df_comb['tour_zone'] = df_comb['longitude'].apply(lambda x: get_timezone(x))
df1_comb['tour_zone'] = df1_comb['longitude'].apply(lambda x: get_timezone(x))

In [47]:
df_comb['time_diff'] = df_comb.apply(lambda x: x['tour_zone'] - x['time_zone'], axis = 1)
df1_comb['time_diff'] = df1_comb.apply(lambda x: x['tour_zone'] - x['time_zone'], axis = 1)

### Invited, Maybe, going, not_going

*3 additional features Invited , maybe and going are computed which consist of the number of friends of the biker (extracted from bikers_network) in going,maybe and not going given in tour_convoy*

#### Train

In [53]:
def going_not_maybe_invited(biker_id, tour_id):
    friends = bikers_network[bikers_network['biker_id'] == biker_id].friends.str.split(' ').to_list()[0]
    going = tour_convoy[tour_convoy['tour_id'] == tour_id].going.str.split(' ').to_list()[0]
    not_going = tour_convoy[tour_convoy['tour_id'] == tour_id].not_going.str.split(' ').to_list()[0]
    invited = tour_convoy[tour_convoy['tour_id'] == tour_id].invited.str.split(' ').to_list()[0]
    maybe = tour_convoy[tour_convoy['tour_id'] == tour_id].maybe.str.split(' ').to_list()[0]
    g = []
    n = []
    m = []
    i = []
    gg = 0
    ng = 0
    inv = 0
    myb = 0
    if str(friends) != 'nan':
        if str(going) != 'nan':
            g = set(friends).intersection(going)
            gg = len(going)
        if str(not_going) != 'nan':
            n = set(friends).intersection(not_going)
            ng = len(not_going)
        if str(maybe) != 'nan':
            m = set(friends).intersection(maybe)
            myb = len(maybe)
        if str(invited) != 'nan':
            i = set(friends).intersection(invited)
            inv = len(invited)
    
    return (len(g),len(n),len(m),len(i), gg, ng, myb, inv)  

In [54]:
df_comb['list'] = df_comb.apply(lambda x: going_not_maybe_invited(x['biker_id_x'], x['tour_id']), axis = 1)
df_comb['going'] = df_comb.apply(lambda x: x['list'][0], axis = 1)
df_comb['not_going'] = df_comb.apply(lambda x: x['list'][1], axis = 1)
df_comb['maybe'] = df_comb.apply(lambda x: x['list'][2], axis = 1)
df_comb['invite'] = df_comb.apply(lambda x: x['list'][3], axis = 1)
df_comb['tot_going'] = df_comb.apply(lambda x: x['list'][4], axis = 1)
df_comb['tot_not_going'] = df_comb.apply(lambda x: x['list'][5], axis = 1)
df_comb['tot_maybe'] = df_comb.apply(lambda x: x['list'][6], axis = 1)
df_comb['tot_invite'] = df_comb.apply(lambda x: x['list'][7], axis = 1)
df_comb.drop(columns = ['list'], inplace = True)

#### Test

In [60]:
df1_comb['list'] = df1_comb.apply(lambda x: going_not_maybe_invited(x['biker_id_x'], x['tour_id']), axis = 1)
df1_comb['going'] = df1_comb.apply(lambda x: x['list'][0], axis = 1)
df1_comb['not_going'] = df1_comb.apply(lambda x: x['list'][1], axis = 1)
df1_comb['maybe'] = df1_comb.apply(lambda x: x['list'][2], axis = 1)
df1_comb['invite'] = df1_comb.apply(lambda x: x['list'][3], axis = 1)
df1_comb['tot_going'] = df1_comb.apply(lambda x: x['list'][4], axis = 1)
df1_comb['tot_not_going'] = df1_comb.apply(lambda x: x['list'][5], axis = 1)
df1_comb['tot_maybe'] = df1_comb.apply(lambda x: x['list'][6], axis = 1)
df1_comb['tot_invite'] = df1_comb.apply(lambda x: x['list'][7], axis = 1)
df1_comb.drop(columns = ['list'], inplace = True)

### No of Friends

*This features represents the number of friends of the biker*

In [61]:
def get_count(x):
    if x['friends'] == x['friends']:
        return len(x['friends'].split())
    return 0

In [62]:
bikers_network['num'] = bikers_network.apply(lambda x: get_count(x), axis = 1)

In [64]:
df_comb['num_frnd'] = df_comb['biker_id_x'].apply(lambda x: int(bikers_network[bikers_network['biker_id'] == x]['num']))
df1_comb['num_frnd'] = df1_comb['biker_id_x'].apply(lambda x: int(bikers_network[bikers_network['biker_id'] == x]['num']))

### Impute Days and Month

In [67]:
import datetime  
from datetime import date 
import calendar 
  
def findDay(date): 
    day, month, year = (int(i) for i in date.split('-'))     
    x = datetime.date(year, month, day) 
    return x.strftime("%A")

def findDay1(date): 
    day, month, year = (int(i) for i in date.split(' ')[0].split('-'))     
    x = datetime.date(year, month, day) 
    return x.strftime("%A")

def findMonth(date):
    b = date.split('-')[1]
    return int(b)

def findMonth1(date):
    b = date.split(' ')[0].split('-')[1]
    return int(b)

def findDate(date):
    b = date.split('-')[0]
    return int(b)

def findDate1(date):
    b = date.split(' ')[0].split('-')[0]
    return int(b)

## Train

In [68]:
df_comb['tour_month'] = df_comb['tour_date'].apply(lambda x: findMonth(x))
df_comb['invite_month'] = df_comb['timestamp'].apply(lambda x: findMonth1(x))
df_comb['mem_month'] = df_comb['member_since'].apply(lambda x: findMonth(x))

In [69]:
df_comb['tour_date1'] = df_comb['tour_date'].apply(lambda x: findDate(x))
df_comb['invite_date'] = df_comb['timestamp'].apply(lambda x: findDate1(x))

In [70]:
df_comb['tour_day'] = df_comb['tour_date'].apply(lambda x: findDay(x))
df_comb['invite_day'] = df_comb['timestamp'].apply(lambda x: findDay1(x))

## Test

In [71]:
df1_comb['tour_month'] = df1_comb['tour_date'].apply(lambda x: findMonth(x))
df1_comb['invite_month'] = df1_comb['timestamp'].apply(lambda x: findMonth1(x))
df1_comb['mem_month'] = df1_comb['member_since'].apply(lambda x: findMonth(x))

In [72]:
df1_comb['tour_date1'] = df1_comb['tour_date'].apply(lambda x: findDate(x))
df1_comb['invite_date'] = df1_comb['timestamp'].apply(lambda x: findDate1(x))

In [73]:
df1_comb['tour_day'] = df1_comb['tour_date'].apply(lambda x: findDay(x))
df1_comb['invite_day'] = df1_comb['timestamp'].apply(lambda x: findDay1(x))

### Time difference between invite date and tour rate

In [83]:
from datetime import datetime

def get_time(x, var1, var2): 
    then = 0
    now = 0
    if var1 == 'timestamp':
        day, month, yr = map(int, x[var1].split(' ')[0].split('-'))
        hr, mins, sec = map(int, x[var1].split(' ')[1].split(':'))
        then = datetime(yr, month, day, hr, mins, sec)
    
    else:
        day, month, yr = map(int, x[var1].split('-'))
        then = datetime(yr, month, day)
        
    dd, mm, yy = map(int, x[var2].split('-'))
    now = datetime(yy, mm, dd)
    duration = now - then                        
    duration_in_s = duration.total_seconds() 
    return duration_in_s

### Train

In [84]:
df_comb['time_bw_invite_tour'] = df_comb.apply(lambda x: get_time(x, 'timestamp', 'tour_date'), axis = 1)
df_comb['time_bw_invite_mem'] = df_comb.apply(lambda x: get_time(x, 'timestamp', 'member_since'), axis = 1)
df_comb['time_bw_tour_mem'] = df_comb.apply(lambda x: get_time(x, 'member_since', 'tour_date'), axis = 1)

### Test

In [85]:
df1_comb['time_bw_invite_tour'] = df1_comb.apply(lambda x: get_time(x,'timestamp', 'tour_date'), axis = 1)
df1_comb['time_bw_invite_mem'] = df1_comb.apply(lambda x: get_time(x, 'timestamp', 'member_since'), axis = 1)
df1_comb['time_bw_tour_mem'] = df1_comb.apply(lambda x: get_time(x,'member_since', 'tour_date'), axis = 1)

### Word Count and Ratio

*An additional feature called total_ws is created which consists of the sum of all 100 w's.Also the ratio of total_ws and w_other is taken as an other feature*

In [88]:
index_w = df_comb.columns.get_loc('w1')
df_comb['total_ws'] = df_comb.apply(lambda x: x.iloc[index_w:index_w + 100].sum(), axis = 1)

In [89]:
df1_comb['total_ws'] = df1_comb.apply(lambda x: x.iloc[index_w:index_w + 100].sum(), axis = 1)

In [92]:
def get_word_ratio(x):
    try:
        return x['total_ws']/x['w_other']
    except:
        return np.nan

In [93]:
df_comb['w_ratio'] = df_comb.apply(lambda x: get_word_ratio(x), axis = 1)
df1_comb['w_ratio'] = df1_comb.apply(lambda x: get_word_ratio(x), axis = 1)

### Friendship with Organizer

*This features checks the relation between the biker and the tour organizer.If the biker is a friend of the organizer we treat it as 1 else we treat it as 0*

In [96]:
def is_biker_friend(x):
    try:
        if int(bikers_network[bikers_network['biker_id'] == x['biker_id_x']]['friends'].str.contains(x['biker_id_y'])):
            return 1
        else:
            return 0
    except:
        return 0    

In [97]:
df_comb['is_friend'] = df_comb.apply(lambda x: is_biker_friend(x), axis = 1)
df1_comb['is_friend'] = df1_comb.apply(lambda x: is_biker_friend(x), axis = 1)

### Age Calculation

*The age of the biker on the day of the tour is represented through this feature*

In [98]:
def get_age(year, date):
    day, month, year1 = (int(i) for i in date.split('-')) 
    try:
        return int(year1) - int(year)
    except:
        return np.nan

In [99]:
df_comb['age'] = df_comb.apply(lambda x: get_age(x['bornIn'], x['tour_date']), axis = 1)

Unnamed: 0,biker_id_x,tour_id,invited,timestamp,language_id,location_id,bornIn,gender,member_since,area,time_zone,biker_id_y,tour_date,city,state,pincode,country,latitude,longitude,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20,w21,w22,w23,w24,w25,w26,w27,w28,w29,w30,w31,w32,w33,w34,w35,w36,w37,w38,w39,w40,w41,w42,w43,w44,w45,w46,w47,w48,w49,w50,w51,w52,w53,w54,w55,w56,w57,w58,w59,w60,w61,w62,w63,w64,w65,w66,w67,w68,w69,w70,w71,w72,w73,w74,w75,w76,w77,w78,w79,w80,w81,w82,w83,w84,w85,w86,w87,w88,w89,w90,w91,w92,w93,w94,w95,w96,w97,w98,w99,w100,w_other,biker_lat,biker_long,distance,tour_zone,time_diff,going,not_going,maybe,invite,tot_going,tot_not_going,tot_maybe,tot_invite,num_frnd,tour_month,invite_month,mem_month,tour_date1,invite_date,tour_day,invite_day,time_bw_invite_tour,time_bw_invite_mem,time_bw_tour_mem,total_ws,w_ratio,is_friend,age
0,DA44012,QY18771225,0,02-10-2012 15:53:05,id,ID,1990,male,02-10-2012,Binjai,480.0,EB06419938,03-10-2012,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3.606396,98.489987,,,,0,3,0,3,8,25,2,23,862,10,10,10,3,2,Wednesday,Tuesday,29215.0,-57185.0,86400.0,0,0.0,0,22.0
1,DA44012,QU02284248,0,02-10-2012 15:53:05,id,ID,1990,male,02-10-2012,Binjai,480.0,CA16654644,03-10-2012,Yogyakarta,,,Indonesia,-7.767,110.363,2,0,0,0,2,3,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,3.606396,98.489987,1822.247295,441.0,-39.0,0,0,0,1,10,2,6,122,862,10,10,10,3,2,Wednesday,Tuesday,29215.0,-57185.0,86400.0,14,0.583333,0,22.0
2,DA44012,RU29072432,0,02-10-2012 15:53:05,id,ID,1990,male,02-10-2012,Binjai,480.0,DG39934255,26-10-2012,Medan,,,Indonesia,3.567,98.65,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,37,3.606396,98.489987,18.30394,394.0,-86.0,0,0,0,2,212,137,154,3844,862,10,10,10,26,2,Friday,Tuesday,2016415.0,-57185.0,2073600.0,9,0.243243,0,22.0
3,DA44012,SP72478280,0,02-10-2012 15:53:05,id,ID,1990,male,02-10-2012,Binjai,480.0,JH461525,06-10-2012,,,,,34.017,71.583,1,2,1,1,0,0,2,0,0,1,1,1,2,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,56,3.606396,98.489987,4371.066216,286.0,-194.0,1,0,0,0,8,0,6,9,862,10,10,10,6,2,Saturday,Tuesday,288415.0,-57185.0,345600.0,30,0.535714,0,22.0
4,DA44012,QS90707377,0,02-10-2012 15:53:05,id,ID,1990,male,02-10-2012,Binjai,480.0,DG39934255,06-10-2012,Medan,,,Indonesia,3.607,98.653,2,0,0,0,0,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,79,3.606396,98.489987,18.111001,394.0,-86.0,0,0,0,2,83,55,65,3814,862,10,10,10,6,2,Saturday,Tuesday,288415.0,-57185.0,345600.0,12,0.151899,0,22.0


In [100]:
df1_comb['age'] = df1_comb.apply(lambda x: get_age(x['bornIn'], x['tour_date']), axis = 1)

Unnamed: 0,biker_id_x,tour_id,invited,timestamp,language_id,location_id,bornIn,gender,member_since,area,time_zone,biker_id_y,tour_date,city,state,pincode,country,latitude,longitude,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20,w21,w22,w23,w24,w25,w26,w27,w28,w29,w30,w31,w32,w33,w34,w35,w36,w37,w38,w39,w40,w41,w42,w43,w44,w45,w46,w47,w48,w49,w50,w51,w52,w53,w54,w55,w56,w57,w58,w59,w60,w61,w62,w63,w64,w65,w66,w67,w68,w69,w70,w71,w72,w73,w74,w75,w76,w77,w78,w79,w80,w81,w82,w83,w84,w85,w86,w87,w88,w89,w90,w91,w92,w93,w94,w95,w96,w97,w98,w99,w100,w_other,biker_lat,biker_long,distance,tour_zone,time_diff,going,not_going,maybe,invite,tot_going,tot_not_going,tot_maybe,tot_invite,num_frnd,tour_month,invite_month,mem_month,tour_date1,invite_date,tour_day,invite_day,time_bw_invite_tour,time_bw_invite_mem,time_bw_tour_mem,total_ws,w_ratio,is_friend,age
0,CG33145288,QX16813281,0,01-11-2012 10:14:42,en,US,1993,male,01-11-2012,Phnom Penh,420.0,BB12186589,10-11-2012,,,,,11.529,104.931,7,3,7,1,3,4,2,2,0,3,2,3,5,1,1,0,1,0,1,0,0,2,1,0,0,0,0,1,1,0,0,0,0,0,2,0,1,0,0,4,0,0,0,0,0,0,1,0,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,81,11.568271,104.922443,4.443266,419.0,-1.0,0,0,2,23,223,87,180,2521,2527,11,11,11,10,1,Saturday,Thursday,740718.0,-36882.0,777600.0,74,0.91358,0,19.0
1,CG33145288,QR69035551,0,01-11-2012 10:14:08,en,US,1993,male,01-11-2012,Phnom Penh,420.0,HA0933835,01-02-2013,Phnom Penh,,,Cambodia,11.569,104.914,2,1,3,0,0,0,2,1,3,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,2,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,36,11.568271,104.922443,0.924382,419.0,-1.0,7,0,5,118,510,0,384,9094,2527,2,11,11,1,1,Friday,Thursday,7911952.0,-36848.0,7948800.0,33,0.916667,0,20.0
2,CG33145288,VW3098017,0,01-11-2012 10:14:08,en,US,1993,male,01-11-2012,Phnom Penh,420.0,DC74062122,04-11-2012,Phnom Penh,,,Cambodia,11.551,104.929,0,0,2,0,0,2,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,11.568271,104.922443,2.039999,419.0,-1.0,0,1,0,16,20,7,6,454,2527,11,11,11,4,1,Sunday,Thursday,222352.0,-36848.0,259200.0,12,0.4,0,19.0
3,EC61865653,RR14608095,0,01-11-2012 02:14:15,id,ID,1993,male,01-11-2012,Magelang,540.0,EG82098,04-11-2012,,,,,,,0,0,0,0,2,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,84,-7.513613,110.21433,,,,0,0,1,3,382,67,135,2927,847,11,11,11,4,1,Sunday,Thursday,251145.0,-8055.0,259200.0,10,0.119048,0,19.0
4,CG33145288,RP07279414,0,01-11-2012 10:14:08,en,US,1993,male,01-11-2012,Phnom Penh,420.0,HA0933835,28-11-2013,Phnom Penh,,,Cambodia,11.569,104.914,1,0,4,1,1,0,0,1,0,2,0,1,3,0,0,2,0,0,0,0,0,1,0,2,2,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,1,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,28,11.568271,104.922443,0.924382,419.0,-1.0,5,0,4,121,571,0,348,9064,2527,11,11,11,28,1,Thursday,Thursday,33831952.0,-36848.0,33868800.0,34,1.214286,0,20.0


### weekend

*This feature gives us information ,if the day of the tour is a weekday or a weekend . Here weekends are grouped as 1 and weekdays are grouped as 0*

In [101]:
weekend = {'Monday' : 0, 'Tuesday': 0, 'Wednesday': 0, 'Thursday' : 0, 'Friday': 1, 'Saturday': 1, 'Sunday': 1}
df_comb['weekend'] = df_comb['tour_day'].map(weekend)
df1_comb['weekend'] = df1_comb['tour_day'].map(weekend)

In [107]:
not_req = ['biker_id_x', 'tour_id','timestamp','member_since', 'area','biker_id_y', 'tour_date', 'city', 'state', 'pincode', 'country', 'latitude', 'longitude', 'biker_lat', 'biker_long']

In [113]:
df_comb.drop(columns = not_req, inplace = True)
df1_comb.drop(columns = not_req, inplace = True)

In [127]:
df1_comb['bornIn'] = df1_comb['bornIn'].astype('float64')

In [130]:
df_comb.gender.fillna(df_comb.gender.mode()[0], inplace = True)
df1_comb.gender.fillna(df1_comb.gender.mode()[0], inplace = True)

# Training the preprocessed data 

In [4]:
# splitting the data into train and validation sets
train_features, val_features, train_labels, val_labels = train_test_split(df_comb , output ,train_size=0.8, test_size=0.2, random_state=0)

# Cat Boost

In [5]:
cat_features = [1, 2, 4, 124, 125]

train_label = train_labels
eval_label = val_labels


train_dataset = Pool( data = train_features,
                      label = train_label,
                      cat_features = cat_features
                )

eval_dataset = Pool(data = val_features,
                    label = eval_label,
                     cat_features = cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations = 6000,
                           learning_rate = 0.01,
                           depth = 8,
                           l2_leaf_reg = 10,
                           loss_function = 'Logloss',
                           verbose = 0,
                           eval_metric = 'AUC')
         
model.fit(train_dataset,plot=False,eval_set=eval_dataset)

<catboost.core.CatBoostClassifier at 0x1fd03f0e1c0>

In [6]:
caty = model.predict_proba(df1_comb)             
caty = pd.DataFrame(caty)

In [7]:
submit = test[['biker_id','tour_id']]
submit['like'] = caty.iloc[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit['like'] = caty.iloc[:,1]


### converting the results into submission format

In [8]:
pred_me = submit.groupby('biker_id')
biker = pd.unique(submit['biker_id'])

In [9]:
final = {}
for j in biker:
    tours = pred_me.get_group(j)
    order = {}
    for i,row in tours.iterrows():
        order[row['tour_id']] = row['like']
    x = sorted(order.items(), key = lambda kv:(kv[1], kv[0]))
    x = [i[0] for i in x]
    x.reverse()
    x = ' '.join(x)
    final[j] = x

In [10]:
sunil = pd.DataFrame(final.items(), columns=['biker_id', 'tour_id'])
sunil = sunil.set_index(['biker_id'])

In [11]:
sunil.to_csv('CE18B057_CE18B125_1.csv')

# Lightgbm

In [12]:
import lightgbm as lgb
from sklearn.metrics import *

In [13]:
for c in df_comb.columns:
    col_type = df_comb[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        df_comb[c] = df_comb[c].astype('category')

In [14]:
x_train1 ,x_val1, y_train1, y_val1 = train_test_split(df_comb, output,train_size=0.8, test_size=0.2, random_state=0)

In [15]:
for c in df1_comb.columns:
    col_type = df1_comb[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        df1_comb[c] = df1_comb[c].astype('category')

In [16]:
lgb_params = {'learning_rate': 0.013410208385475693,
 'n_estimators': 428,
 'num_leaves': 180,
 'metric': 'auc'}


In [17]:
lgbm_new = lgb.LGBMClassifier(**lgb_params)
lgbm_new.fit(x_train1 ,y_train1)

LGBMClassifier(learning_rate=0.013410208385475693, metric='auc',
               n_estimators=428, num_leaves=180)

In [19]:
bayes_lgb = lgbm_new.predict_proba(df1_comb)
bayes_lgb = pd.DataFrame(bayes_lgb)

# Ensembling Catboost and Lightgbm

In [21]:
submit['like'] = bayes_lgb.iloc[:,1]*0.45 + caty.iloc[:,1]*0.55

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submit['like'] = bayes_lgb.iloc[:,1]*0.45 + caty.iloc[:,1]*0.55


In [22]:
pred_me = submit.groupby('biker_id')
biker = pd.unique(submit['biker_id'])

In [23]:
final = {}
for j in biker:
    tours = pred_me.get_group(j)
    order = {}
    for i,row in tours.iterrows():
        order[row['tour_id']] = row['like']
    x = sorted(order.items(), key = lambda kv:(kv[1], kv[0]))
    x = [i[0] for i in x]
    x.reverse()
    x = ' '.join(x)
    final[j] = x

In [24]:
sunil = pd.DataFrame(final.items(), columns=['biker_id', 'tour_id'])
sunil = sunil.set_index(['biker_id'])

In [25]:
sunil.to_csv('CE18B057_CE18B125_2.csv')