In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [None]:
club = pd.read_csv("train.csv")
club_test = pd.read_csv("test.csv")

# Step 1: Data checking and null value analysis

In [None]:
club.head(5)

In [None]:
club.shape

In [None]:
club.info()

# checking the distribution of numerical data for outliers:

In [None]:
club.describe()

In [None]:
club.columns

In [None]:
club.dtypes

In [None]:
club_test.head(5)

In [None]:
club_test.shape

# there is some categorical data so we would need to do some dummy variable creation

# Step 2 : New features creation for analysis:

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: a parsed Timestamp always has a
# four-digit year, so it did nothing for valid dates, while for missing dates
# (NaT, whose .year is NaN) it wrote the string '20nan' and turned the column
# into object dtype.
club['booking_date'] = pd.to_datetime(club['booking_date'], dayfirst=True)

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: it was a no-op for valid dates and
# replaced NaT with the string '20nan', which breaks the later `.dt` accessors.
club['checkin_date'] = pd.to_datetime(club['checkin_date'], dayfirst=True)

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: it was a no-op for valid dates and
# replaced NaT with the string '20nan', which breaks the later date arithmetic.
club['checkout_date'] = pd.to_datetime(club['checkout_date'], dayfirst=True)

In [None]:
#extract the month from the checkin_date
club["Check_in_month"] = club["checkin_date"].dt.month

In [None]:
club['booking_duration'] = (club['checkin_date']-club['booking_date']).dt.days

In [None]:
club['no_of_days_spent_in_resort'] = (club['checkout_date']-club['checkin_date']).dt.days

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: a parsed Timestamp always has a
# four-digit year, so it did nothing for valid dates, while for missing dates
# (NaT, whose .year is NaN) it wrote the string '20nan' and turned the column
# into object dtype.
club_test['booking_date'] = pd.to_datetime(club_test['booking_date'], dayfirst=True)

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: it was a no-op for valid dates and
# replaced NaT with the string '20nan', which breaks the later `.dt` accessors.
club_test['checkin_date'] = pd.to_datetime(club_test['checkin_date'], dayfirst=True)

In [None]:
# Parse the day-first date strings into datetimes.
# The former follow-up `.apply` was removed: it was a no-op for valid dates and
# replaced NaT with the string '20nan', which breaks the later date arithmetic.
club_test['checkout_date'] = pd.to_datetime(club_test['checkout_date'], dayfirst=True)

In [None]:
#extract the month from the checkin_date
club_test["Check_in_month"] = club_test["checkin_date"].dt.month

In [None]:
club_test['booking_duration'] = (club_test['checkin_date']-club_test['booking_date']).dt.days

In [None]:
club_test['no_of_days_spent_in_resort'] = (club_test['checkout_date']-club_test['checkin_date']).dt.days

# dividing the data based on seasons:

In [None]:
# Bucket a month number into one of four season labels
def months(x):
    """Return the season label for month number ``x``.

    x < 3       -> "Spring"
    3 <= x < 6  -> "Summer"
    6 <= x < 9  -> "Rainy"
    otherwise   -> "Winter" (this includes values that compare False
                   against every bound, such as NaN)
    """
    season_upper_bounds = ((3, "Spring"), (6, "Summer"), (9, "Rainy"))
    for upper_bound, season in season_upper_bounds:
        if x < upper_bound:
            return season
    return "Winter"

club['Check_in_month'] = club.Check_in_month.apply(lambda x: months(x))
club_test['Check_in_month'] = club_test.Check_in_month.apply(lambda x: months(x))

In [None]:
club.Check_in_month.value_counts()

In [None]:
# Missing-value summary: null count and null fraction per column, largest first
null_mask = club.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
null_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
null_data.head(10)


# Since very few rows contain null values, we will drop those rows

In [None]:
club = club.dropna()
club_test = club_test.dropna()

In [None]:
club.info()

In [None]:
club.shape

In [None]:
# Correlation heatmap of the numeric columns.
# Only numeric columns are passed to corr(): the frame still contains
# non-numeric columns here (dates, categorical labels), and recent pandas
# raises on those instead of silently skipping them.
corrmat = club.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corrmat, vmax=.8, square=True);

In [None]:
#creating correlation matrix for the principal components
#corrmat = np.corrcoef(club.transpose())

In [None]:
# 1s -> 0s in diagonals
#corrmat_nodiag = corrmat - np.diagflat(corrmat.diagonal())
#print("max corr:",corrmat_nodiag.max(), ", min corr: ", corrmat_nodiag.min(),)
# we see that correlations are indeed very close to 0

# Since we have created new features from the date columns, and a few features are not necessary, we will drop them

In [None]:
# Identifier and raw date columns are dropped from both frames
unused_columns = ['reservation_id', 'memberid', 'booking_date', 'checkin_date', 'checkout_date']
club = club.drop(columns=unused_columns)
club_test = club_test.drop(columns=unused_columns)

In [None]:
# List the columns that are stored with dtype 'object'
temp = club.select_dtypes(include='object')
temp.columns

# Step 3- Exploratory data analysis:

# Distribution plots, one panel per column, side by side
plt.figure(figsize=[15,5])
for position, column in enumerate(['channel_code', 'main_product_code', 'numberofadults'], start=1):
    plt.subplot(1, 3, position)
    plt.title(column)
    sns.distplot(club[column])

plt.show()

# Box plots, one panel per column, side by side
plt.figure(figsize=(15, 5))
for position, column in enumerate(['channel_code', 'main_product_code', 'numberofadults'], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(y=column, data=club)
    plt.title(column)

plt.show()

# Distribution plots, one panel per column, side by side
plt.figure(figsize=[15,5])
for position, column in enumerate(['numberofchildren', 'persontravellingid', 'resort_region_code'], start=1):
    plt.subplot(1, 3, position)
    plt.title(column)
    sns.distplot(club[column])

plt.show()

# Box plots, one panel per column, side by side
plt.figure(figsize=(15, 5))
for position, column in enumerate(['numberofchildren', 'persontravellingid', 'resort_region_code'], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(y=column, data=club)
    plt.title(column)

plt.show()

# Distribution plots, one panel per column, side by side
plt.figure(figsize=[15,5])
for position, column in enumerate(['resort_type_code', 'room_type_booked_code', 'roomnights'], start=1):
    plt.subplot(1, 3, position)
    plt.title(column)
    sns.distplot(club[column])

plt.show()

# Box plots, one panel per column, side by side
plt.figure(figsize=(15, 5))
for position, column in enumerate(['resort_type_code', 'room_type_booked_code', 'roomnights'], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(y=column, data=club)
    plt.title(column)

plt.show()

# Distribution plots for the season and state codes, one panel per column.
plt.figure(figsize=(15, 5))

# subplot 1
# Bug fix: this panel used slot (1, 3, 2), the same slot as the next panel, so
# the season_holidayed_code plot was replaced and slot 1 stayed empty.
plt.subplot(1, 3, 1)
plt.title('season_holidayed_code')
sns.distplot(club['season_holidayed_code'])

# subplot 2
plt.subplot(1, 3, 2)
plt.title('state_code_residence')
sns.distplot(club['state_code_residence'])

# subplot 3
plt.subplot(1, 3, 3)
plt.title('state_code_resort')
sns.distplot(club['state_code_resort'])

plt.show()

# Box plots for the season and state codes, one panel per column.
plt.figure(figsize=(15, 5))

# subplot 1
# Bug fix: this panel used slot (1, 3, 2), the same slot as the next panel, so
# the season_holidayed_code plot was replaced and slot 1 stayed empty.
plt.subplot(1, 3, 1)
sns.boxplot(y='season_holidayed_code', data=club)
plt.title("season_holidayed_code")

# subplot 2
plt.subplot(1, 3, 2)
sns.boxplot(y='state_code_residence', data=club)
plt.title("state_code_residence")

# subplot 3
plt.subplot(1, 3, 3)
sns.boxplot(y='state_code_resort', data=club)
plt.title("state_code_resort")

plt.show()

# Distribution plots for the two remaining columns (third slot left empty)
plt.figure(figsize=[15,5])
for position, column in enumerate(['total_pax', 'booking_type_code'], start=1):
    plt.subplot(1, 3, position)
    plt.title(column)
    sns.distplot(club[column])


# Box plots for the two remaining columns (third slot left empty)
plt.figure(figsize=(15, 5))
for position, column in enumerate(['total_pax', 'booking_type_code'], start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(y=column, data=club)
    plt.title(column)






# Row counts per no_of_days_spent_in_resort; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='no_of_days_spent_in_resort', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:


# Row counts per channel_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='channel_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:


# Row counts per main_product_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='main_product_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:



# Row counts per numberofadults; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='numberofadults', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per numberofchildren; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='numberofchildren', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per persontravellingid; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='persontravellingid', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per resort_region_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='resort_region_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per resort_type_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='resort_type_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per room_type_booked_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='room_type_booked_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per roomnights; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='roomnights', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per season_holidayed_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='season_holidayed_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per state_code_residence; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='state_code_residence', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per state_code_resort; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='state_code_resort', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per total_pax; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='total_pax', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per member_age_buckets; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='member_age_buckets', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per booking_type_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='booking_type_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per cluster_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='cluster_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per reservationstatusid_code; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='reservationstatusid_code', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per resort_id; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='resort_id', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per Check_in_month; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='Check_in_month', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per booking_duration; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='booking_duration', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

In [None]:

# Row counts per no_of_days_spent_in_resort; each bar is labelled with its share of all rows
total = float(len(club))
plt.rcParams["figure.figsize"] = (8, 6)
ax = sns.countplot(x='no_of_days_spent_in_resort', palette="pastel", data=club)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 3, '{0:.0%}'.format(bar_height / total), ha="center")

# We can drop resort_id: it is categorical, and creating dummy variables for it would add too many columns

In [None]:
# Remove resort_id from both frames, in place
for frame in (club, club_test):
    frame.drop(columns=['resort_id'], inplace=True)

In [None]:
club.shape

# Step 4 - Dummy variable creation :

In [None]:
# Re-label the persontravellingid codes as letters
def ids(x):
    """Map a persontravellingid code to a letter label.

    45 -> "A", 46 -> "B", 47 -> "C", 4752 -> "D", 4753 -> "E";
    any other value -> "F".
    """
    labels = {45: "A", 46: "B", 47: "C", 4752: "D", 4753: "E"}
    return labels.get(x, "F")

club['persontravellingid'] = club.persontravellingid.apply(lambda x: ids(x))
club_test['persontravellingid'] = club_test.persontravellingid.apply(lambda x: ids(x))

In [None]:
# Re-label the channel_code values as letters
def ids(x):
    """Map a channel_code value to a label: 1 -> "A", 2 -> "B", anything else -> "c"."""
    labels = {1: "A", 2: "B"}
    return labels.get(x, "c")

club['channel_code'] = club.channel_code.apply(lambda x: ids(x))
club_test['channel_code'] = club_test.channel_code.apply(lambda x: ids(x))

In [None]:
# Re-label the main_product_code values as letters
def ids(x):
    """Map a main_product_code value to a label: 1..4 -> "A".."D", anything else -> "E"."""
    labels = {1: "A", 2: "B", 3: "C", 4: "D"}
    return labels.get(x, "E")

club['main_product_code'] = club.main_product_code.apply(lambda x: ids(x))
club_test['main_product_code'] = club_test.main_product_code.apply(lambda x: ids(x))

In [None]:
# Re-label the resort_region_code values as letters
def ids(x):
    """Map a resort_region_code value to a label: 1 -> "A", 2 -> "B", anything else -> "C"."""
    labels = {1: "A", 2: "B"}
    return labels.get(x, "C")

club['resort_region_code'] = club.resort_region_code.apply(lambda x: ids(x))
club_test['resort_region_code'] = club_test.resort_region_code.apply(lambda x: ids(x))

In [None]:
# Re-label the resort_type_code values as letters
def ids(x):
    """Map a resort_type_code value to a label: 1..6 -> "A".."F", anything else -> "G"."""
    labels = {1: "A", 2: "B", 3: "C", 4: "D", 5: "E", 6: "F"}
    return labels.get(x, "G")

club['resort_type_code'] = club.resort_type_code.apply(lambda x: ids(x))
club_test['resort_type_code'] = club_test.resort_type_code.apply(lambda x: ids(x))

In [None]:
# Re-label the room_type_booked_code values as letters
def ids(x):
    """Map a room_type_booked_code value to a label: 1..5 -> "A".."E", anything else -> "F"."""
    labels = {1: "A", 2: "B", 3: "C", 4: "D", 5: "E"}
    return labels.get(x, "F")

club['room_type_booked_code'] = club.room_type_booked_code.apply(lambda x: ids(x))
club_test['room_type_booked_code'] = club_test.room_type_booked_code.apply(lambda x: ids(x))

In [None]:
# Re-label the state_code_resort values as letters
# Codes 1..12 pair up with the letters A..L in order.
_RESORT_STATE_LABELS = dict(enumerate("ABCDEFGHIJKL", start=1))


def ids(x):
    """Map a state_code_resort value to a label: 1..12 -> "A".."L", anything else -> "M"."""
    return _RESORT_STATE_LABELS.get(x, "M")

club['state_code_resort'] = club.state_code_resort.apply(lambda x: ids(x))
club_test['state_code_resort'] = club_test.state_code_resort.apply(lambda x: ids(x))

In [None]:

# Re-label the state_code_residence values as letters
# Codes 1..26 pair up with A..Z, codes 27..36 with AA..AJ.
_RESIDENCE_STATE_LABELS = dict(
    enumerate(
        list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ["A" + suffix for suffix in "ABCDEFGHIJ"],
        start=1,
    )
)


def ids(x):
    """Map a state_code_residence value to a label.

    1..26 -> "A".."Z", 27..36 -> "AA".."AJ"; any other value -> "AK".
    """
    return _RESIDENCE_STATE_LABELS.get(x, "AK")

club['state_code_residence'] = club.state_code_residence.apply(lambda x: ids(x))
club_test['state_code_residence'] = club_test.state_code_residence.apply(lambda x: ids(x))

In [None]:
# Re-label the booking_type_code values as letters
def ids(x):
    """Map a booking_type_code value to a label: 1 -> "A", anything else -> "B"."""
    return "A" if x == 1 else "B"

club['booking_type_code'] = club.booking_type_code.apply(lambda x: ids(x))
club_test['booking_type_code'] = club_test.booking_type_code.apply(lambda x: ids(x))

In [None]:
club = club.dropna(subset=['season_holidayed_code'])


# Re-label the season_holidayed_code values as letters
def ids(x):
    """Map a season_holidayed_code value to a label: 1..3 -> "A".."C", anything else -> "D"."""
    if x == 1:
        return "A"
    elif x == 2:
        return "B"
    elif x == 3:
        return "C"
    else:
        return "D"


# Bug fix: the mapping is now applied to season_holidayed_code itself. It was
# previously applied to booking_type_code, which an earlier cell has already
# re-labelled to "A"/"B", so every row ended up as "D".
club['season_holidayed_code'] = club.season_holidayed_code.apply(ids)
club_test['season_holidayed_code'] = club_test.season_holidayed_code.apply(ids)

In [None]:
# Create dummy variables using the 'get_dummies' command
dummy = pd.get_dummies(club[['member_age_buckets', 'cluster_code', 'reservationstatusid_code','Check_in_month','persontravellingid','channel_code','main_product_code','resort_region_code','resort_type_code','room_type_booked_code','state_code_resort','state_code_residence','booking_type_code','season_holidayed_code']], drop_first=True)

In [None]:
# Create dummy variables using the 'get_dummies' command
dummy = pd.get_dummies(club_test[['member_age_buckets', 'cluster_code', 'reservationstatusid_code','Check_in_month','persontravellingid','channel_code','main_product_code','resort_region_code','resort_type_code','room_type_booked_code','state_code_resort','state_code_residence','booking_type_code','season_holidayed_code']], drop_first=True)

In [None]:
# Add the dummy columns to the master dataframe.
# The dummies are built from `club` right here: by this point the shared `dummy`
# variable has been reassigned from club_test, so concatenating it would attach
# the test-set dummies to the training rows.
club_categorical_columns = [
    'member_age_buckets', 'cluster_code', 'reservationstatusid_code', 'Check_in_month',
    'persontravellingid', 'channel_code', 'main_product_code', 'resort_region_code',
    'resort_type_code', 'room_type_booked_code', 'state_code_resort',
    'state_code_residence', 'booking_type_code', 'season_holidayed_code',
]
club = pd.concat(
    [club, pd.get_dummies(club[club_categorical_columns], drop_first=True)],
    axis=1,
)

In [None]:
club_test = pd.concat([club_test, dummy], axis=1)

In [None]:
# Drop the variables for which the dummy variables have been created.
# `columns=` is used because pandas 2.x no longer accepts the axis positionally.
club = club.drop(columns=[
    'member_age_buckets', 'cluster_code', 'reservationstatusid_code', 'Check_in_month',
    'persontravellingid', 'channel_code', 'main_product_code', 'resort_region_code',
    'resort_type_code', 'room_type_booked_code', 'state_code_resort',
    'state_code_residence', 'booking_type_code', 'season_holidayed_code',
])

In [None]:
# Drop the variables for which the dummy variables have been created.
# `columns=` is used because pandas 2.x no longer accepts the axis positionally.
club_test = club_test.drop(columns=[
    'member_age_buckets', 'cluster_code', 'reservationstatusid_code', 'Check_in_month',
    'persontravellingid', 'channel_code', 'main_product_code', 'resort_region_code',
    'resort_type_code', 'room_type_booked_code', 'state_code_resort',
    'state_code_residence', 'booking_type_code', 'season_holidayed_code',
])

In [None]:
club.head(5)

In [None]:
club.columns

# Step 5 - Scaling of data:

In [None]:
# Checking outliers at 25%,50%,75%,90%,95% and 99%
club.describe(percentiles=[.25,.5,.75,.90,.95,.99])

In [None]:
club.columns

# As we don't have any values that seem to be out of range, there is no need to scale the data

# Step 6 - test train split:

In [None]:
# Import the required library

from sklearn.model_selection import train_test_split

In [None]:
# Put all the feature variables in X
# (`columns=` is used because pandas 2.x no longer accepts the axis positionally)

X = club.drop(columns=['amount_spent_per_room_night_scaled'])
X.head()

In [None]:
# Put the target variable in y

y = club['amount_spent_per_room_night_scaled']

y.head()

In [None]:
X

In [None]:
# Split the dataset into 70% train and 30% test, then standardise the features
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# Fit the scaler on the training rows only, so the hold-out rows do not leak
# into the scaling statistics. The scaled arrays are wrapped back into
# DataFrames because later cells use X_train.columns and X_train[col].
# The target y is left untouched: the scaler is fitted on the feature matrix
# and cannot transform a 1-D target.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train = pd.DataFrame(standard_scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(standard_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Align the submission frame to the training feature columns (the dummy levels
# can differ between the two files) before applying the same scaling.
club_test = club_test.reindex(columns=X.columns, fill_value=0)
club_test = pd.DataFrame(standard_scaler.transform(club_test), columns=club_test.columns, index=club_test.index)

In [None]:
X_train.shape

In [None]:
y_train.head()

# Step 7 - Model Building:

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Running RFE with the number of selected features equal to 15
lm = LinearRegression()
lm.fit(X_train, y_train)

# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]  # checkin the columns of the rfe given variables
club[col].astype(int)


In [None]:
X_train.columns[~rfe.support_]  #columns which are not in rfe model

In [None]:
# Creating X_test dataframe with RFE selected variables
X_train_rfe = X_train[col]
#X_train_rfe.drop(['persontravellingid'], 1)

In [None]:
# Adding a constant variable 
import statsmodels.api as sm  
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [None]:
#Let's see the summary of our linear model
print(lm.summary())

In [None]:
# list of alphas to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


ridge = Ridge()

# cross validation
folds = 10
model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(X_train, y_train) 

In [None]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results = cv_results[cv_results['param_alpha']<=200]
cv_results.head()

In [None]:
# plotting mean test and train scores with alpha
# Alpha is kept as a float: casting to int32 truncated every alpha below 1
# (0.0001 ... 0.9) to 0, stacking those points on x = 0. `assign` returns a
# new frame, so the filtered slice is not written to in place.
cv_results = cv_results.assign(param_alpha=cv_results['param_alpha'].astype('float32'))

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

# Taking the value of alpha from the graph where the orange line (test score) seems to bend

In [None]:
alpha =3
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
ridge.coef_

In [None]:
lasso = Lasso()

# cross validation
folds = 10
model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

model_cv.fit(X_train, y_train) 

In [None]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results = cv_results[cv_results['param_alpha']<=5]
cv_results

In [None]:
# Plot the mean train and test scores against alpha for the lasso grid search
cv_results = cv_results.assign(param_alpha=cv_results['param_alpha'].astype('float32'))

# one line per score column, train first so the legend order matches
for score_column in ('mean_train_score', 'mean_test_score'):
    plt.plot(cv_results['param_alpha'], cv_results[score_column])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

# Taking the value of alpha from the graph where the orange line (test score) seems to bend

In [None]:
alpha =0.1

lasso = Lasso(alpha=alpha)
        
lasso.fit(X_train, y_train) 

In [None]:
zero=(lasso.coef_==0)
zero

In [None]:
def pretty_print_linear(coefs, names=None, sort=False):
    """Format coefficients as a linear-equation string.

    Parameters
    ----------
    coefs : sequence of numbers
        Coefficients, one per term.
    names : sequence of str, optional
        Term names, paired with ``coefs`` by position. When omitted, names of
        the form " X0 ", " X1 ", ... are generated.
    sort : bool, default False
        When True, terms are ordered by decreasing absolute coefficient.

    Returns
    -------
    str
        Terms of the form "<coef rounded to 3 places> *<name>" joined by "+".
    """
    # `is None` rather than `== None`: with a NumPy array of names, `==` is
    # evaluated element-wise and its truth value is ambiguous.
    if names is None:
        names = [" X%s " % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key=lambda x: -np.abs(x[0]))
    return "+".join("%s *%s" % (round(coef, 3), name)
                    for coef, name in lst)



# equation of lasso regression:


In [None]:
# Use the feature columns only. `club.columns` also contains the target
# column, which would shift the names relative to lasso.coef_.
names = club.drop(columns=['amount_spent_per_room_night_scaled']).columns.tolist()
#names
print("lasso", pretty_print_linear(lasso.coef_, names, sort=True))

In [None]:
ridge_train_score=ridge.score(X_train,y_train)
ridge_test_score=ridge.score(X_test,y_test)
ridge_train_score,ridge_test_score

In [None]:
lasso_train_score=lasso.score(X_train,y_train)
lasso_test_score=lasso.score(X_test,y_test)
lasso_train_score,lasso_test_score

In [None]:
# Pair every lasso coefficient (intercept first) with its feature name
model_param = list(lasso.coef_)
model_param.insert(0, lasso.intercept_)
model_param = [round(x, 3) for x in model_param]
# Feature names are taken from `club` (target dropped), so this works whether
# or not `X` is still a DataFrame at this point.
cols = club.drop(columns=['amount_spent_per_room_night_scaled']).columns
cols = cols.insert(0, "constant")
param = list(zip(cols, model_param))
param

In [None]:
df_test=pd.DataFrame.from_records(param)
df_test

In [None]:
df_test=df_test[df_test[1]!=0]
df_test.head()

In [None]:
col=df_test[0]
col

In [None]:
#fitting simple linear regression to the training set
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)

In [None]:
#predict the test result
y_pred=regressor.predict(X_test)

In [None]:
print(regressor.intercept_)

In [None]:
print(regressor.coef_)

In [None]:
# Show each feature name next to its coefficient. A bare zip object displays
# nothing useful, and iterating a NumPy `X` yields rows, not column names.
list(zip(club.drop(columns=['amount_spent_per_room_night_scaled']).columns, regressor.coef_))

In [None]:
y_pred=regressor.predict(X_test)

In [None]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))

In [None]:
print(metrics.mean_squared_error(y_test,y_pred))

In [None]:
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))