In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, LabelEncoder
from joblib import dump, load
import time
import datetime
import re
import pickle

In [25]:
### preprocessing 

def split_category(value):
    vals = []
    if '|' in value:
        vals = value.split('|')
    else:
        vals.append(value)
    return vals

def save_unique(data, col, filename):
    unique_vals = data[col].unique()
    unique_vals = [i.title() for i in unique_vals]
    unique_vals = pd.DataFrame(unique_vals, columns=[col])
    unique_vals.to_csv(filename, index=False)

def split_columns_with_multiple_values(data):
    data['Airline'] = data['Airline'].apply(split_category)
    data['Places Covered'] = data['Places Covered'].apply(split_category)
    data['Sightseeing Places Covered'] = data['Sightseeing Places Covered'].apply(split_category)
    data['Hotel Details'] = data['Hotel Details'].apply(split_category)    
    
    prepare_itinerary(data)
    
    return data

def one_hot_encoding_on_columns(data):
    for col in data.columns:
        if col in [
            'Itinerary',
            'Sightseeing Places Covered',
            'Places Covered',
            'Hotel Details',
            'Airline',
        ]:
            encoder = MultiLabelBinarizer()
            encoder.fit(data[col])
            dump(encoder, 'encoder_{0}.joblib'.format(col.replace(" ", "_")))
            df = pd.DataFrame(encoder.transform(data[col]), columns=encoder.classes_)
            data.drop(columns=[col], inplace=True)
            data = pd.concat([data, df], axis=1)    

            encoder_classes = pd.DataFrame(encoder.classes_, columns=[col])
            encoder_classes.to_csv('classes_{0}.csv'.format(col.replace(" ", "_")), index=False)
           
        elif data[col].dtype == 'object':
            encoder = LabelEncoder()
            encoder.fit(data[col])
            np.save('encoder_{0}.npy'.format(col.replace(" ", "_")), encoder.classes_, allow_pickle=True)
            data[col] = encoder.transform(data[col])

    return data

def show_category(series):
    values = {}
    for val in series:
        for each in val:
            if each in values:
                values[each] += 1
            else:
                values[each] = 1
    return values

def make_feature_col(series, all_keys):
    feature_dict = {key: [] for key in all_keys}
    for items in series:
        for key in all_keys:
            if key not in items:
                feature_dict[key].append(0)
            else:
                feature_dict[key].append(1)

    return pd.DataFrame(feature_dict)

def prepare_itinerary(data):
    data['Itinerary'] = data['Itinerary'].str.split(".")
    vals = data['Itinerary']
    for j in range(len(vals)):
        for p in range(len(vals[j])):
            vals[j][p] = re.sub(r'[^0-9N]', '', vals[j][p])
            vals[j][p] = vals[j][p].split('N')
            vals[j][p] = list(filter(None, vals[j][p]))
            vals[j][p] = [int(i) for i in vals[j][p]]
        vals[j] = [item for sublist in vals[j] for item in sublist]
    data['Itinerary'] = vals 

def remove_useless_columns(data):
    data.drop('Uniq Id', inplace=True, axis=1)
    data.drop('Package Name', inplace=True, axis=1)
    data.drop('Destination', inplace=True, axis=1)
    data.drop('Cancellation Rules', inplace=True, axis=1)
    return data

def prepare_hotel_details(data):
    data['Hotel Details'] = [re.sub(r':.*', '', i) for i in data['Hotel Details']]
    data['Hotel Details'] = [i.lower() for i in data['Hotel Details']]
    data['Hotel Details'] = data['Hotel Details'].str.replace(" - mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("- mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("-mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("(mmt holidays special)",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace(" - mmt special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("- mmt speciall",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("-mmt special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("mmt special",'')
  
    hotel_details = data['Hotel Details'].unique()
    hotel_details = [i.title() for i in hotel_details]
    hotel_details = pd.DataFrame(hotel_details, columns=['Hotel Details'])
    hotel_details.to_csv('hotel_details.csv', index=False)

    return data

def convert_dates_to_one_format(data):
    data['Travel Date'] = pd.to_datetime(data['Travel Date'], errors='coerce')
    data['Travel Date'] = data['Travel Date'].astype(np.int64) // 10**9    
    return data

def replace_wrong_encoding_to_utf8(data, column_name):
    strings_to_encode = ['â€”', 'â€“', 'â€˜', 'â€™', 'â€œ', 'â€']
    for cell in data[column_name]:
        if cell in strings_to_encode:
            encoded = cell.encode('cp1252')
            cell = encoded.decode('utf-8')

    return data

def remove_unexpected_numbers(data, column_name):
    for cell in data[column_name]:
        if has_numbers(cell):
            data.drop(labels=cell, inplace=False)
    return data

def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

def preprocess_data(data):
    data = remove_useless_columns(data)
    print(data.shape)
    data = remove_unexpected_numbers(data, 'Places Covered')
    print(data.shape)
    data = convert_dates_to_one_format(data)
    data = replace_wrong_encoding_to_utf8(data, 'Sightseeing Places Covered')
    data = prepare_hotel_details(data)
    data = split_columns_with_multiple_values(data)
    data = one_hot_encoding_on_columns(data)
    return(data)

In [26]:
train = pd.read_csv('dataset\Train.csv')
validation = pd.read_csv('dataset\Test.csv')

In [27]:
final_train_data = preprocess_data(train)
final_validation_data = preprocess_data(validation)

(21000, 11)
(21000, 11)


  data['Travel Date'] = pd.to_datetime(data['Travel Date'], errors='coerce')
  data['Hotel Details'] = data['Hotel Details'].str.replace("(mmt holidays special)",'')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vals[j] = [item for sublist in vals[j] for item in sublist]


(9000, 10)


KeyError: "['30-01-2021'] not found in axis"

In [8]:
### Displaying processed data

print('================TRAINING DATA==================')
display(final_train_data)



Unnamed: 0,Package Type,Travel Date,Start City,Flight Stops,Meals,Per Person Price,1,2,3,4,...,Yala Safari,Yuksam,Yumthang Valley,Zero Point,Zip Lining at Mehrangarh Fort - MMT,Zong Dog Palri Fo Brang Gompa,Zoological Park,bandipur National park,bandipur national park,Not Available
0,4,1627603200,0,2,3,11509.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,1628726400,1,0,5,22485.5,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1619395200,1,0,4,12421.5,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1630022400,1,0,5,35967.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,1639267200,1,0,5,25584.0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,1,1618444800,0,1,3,16168.5,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
20996,2,1630281600,1,0,4,33770.5,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
20997,1,1614384000,1,1,3,29964.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20998,1,1642032000,1,1,3,28145.5,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print('================VALIDATION DATA==================')
display(final_validation_data)



Unnamed: 0,Package Type,Travel Date,Start City,Flight Stops,Meals,1,2,3,4,5,...,Yala Safari,Yuksam,Yumthang Valley,Zero Point,Zip Lining at Mehrangarh Fort - MMT,Zong Dog Palri Fo Brang Gompa,Zoological Park,bandipur National park,bandipur national park,Not Available
0,2,1626134400,1,0,4,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2,1616198400,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,1662076800,1,2,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1621123200,0,1,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1634083200,1,0,4,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,4,1617667200,1,2,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8996,1,1625270400,0,1,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8997,2,1629504000,0,0,4,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8998,1,1616112000,1,1,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
### Saving processed data to csv

final_train_data.to_csv('dataset\Train_prepared.csv') 
final_validation_data.to_csv('dataset\Test_prepared.csv')