In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import streamlit as st
from pycaret.regression import *
import re

In [11]:
### preprocessing 

def split_category(value):
    vals = []
    if '|' in value:
        vals = value.split('|')
    else:
        vals.append(value)
    return vals

def split_columns_with_multiple_values(data):
    data['Airline'] = data['Airline'].apply(split_category)
    data['Places Covered'] = data['Places Covered'].apply(split_category)
    data['Sightseeing Places Covered'] = data['Sightseeing Places Covered'].apply(split_category)
    data['Hotel Details'] = data['Hotel Details'].apply(split_category)    
    
    prepare_itinerary(data)
    
    return data

def one_hot_encoding_on_columns(data):
    dummy_type = pd.get_dummies(data['Package Type'], prefix='type')
    data.drop(columns=['Package Type'], inplace=True)
    data = pd.concat([data, dummy_type], axis=1)

    dummy_city = pd.get_dummies(data['Start City'], prefix='sc')
    data.drop(columns=['Start City'], inplace=True)
    data = pd.concat([data, dummy_city], axis=1)
    return data

def show_category(series):
    values = {}
    for val in series:
        for each in val:
            if each in values:
                values[each] += 1
            else:
                values[each] = 1
    return values

def make_feature_col(series, all_keys):
    feature_dict = {key: [] for key in all_keys}
    for items in series:
        for key in all_keys:
            if key not in items:
                feature_dict[key].append(0)
            else:
                feature_dict[key].append(1)

    return pd.DataFrame(feature_dict)

def structuring_columns(data):
    change = lambda pc: [each+'_ae' for each in pc]
    A_all_keys = show_category(data['Airline']).keys()
    A_all_keys = change(A_all_keys)
    airline = make_feature_col(data['Airline'], A_all_keys)
    data = pd.concat([data, airline], axis=1)
    
    change = lambda pc: [each+'_pc' for each in pc]
    PC_all_keys = show_category(data['Places Covered']).keys()
    PC_all_keys = change(PC_all_keys)
    p_covered = make_feature_col(data['Places Covered'], PC_all_keys)
    data = pd.concat([data, p_covered], axis=1)

    change = lambda pc: [each+'_hd' for each in pc]
    HD_all_keys = show_category(data['Hotel Details']).keys()
    HD_all_keys = change(PC_all_keys)
    h_details = make_feature_col(data['Hotel Details'], HD_all_keys)
    data = pd.concat([data, h_details], axis=1)

    return data

def prepare_itinerary(data):
    data['Itinerary'] = data['Itinerary'].str.split(".")
    for j in data['Itinerary']:
        j = re.sub(r'[^0-9N]', '', j)
        j = data['Itinerary'].split('N')
        j = list(filter(None, j))
        j = [int(i) for i in j]
    

def remove_useless_columns(data):
    data.drop('Uniq Id', inplace=True, axis=1)
    data.drop('Package Name', inplace=True, axis=1)
    data.drop('Destination', inplace=True, axis=1)
    data.drop('Cancellation Rules', inplace=True, axis=1)
    return data

def hotel_details_encoding(data):
    data['Hotel Details'] = data['Hotel Details'].str.replace(":One",':1.0')
    data['Hotel Details'] = data['Hotel Details'].str.replace(":Two",':2.0')
    data['Hotel Details'] = data['Hotel Details'].str.replace(":Three",':3.0')
    data['Hotel Details'] = data['Hotel Details'].str.replace(":Four",':4.0')
    data['Hotel Details'] = data['Hotel Details'].str.replace(":Five",':5.0')

    data['Hotel Details'] = data['Hotel Details'].str.replace(" - mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("- mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("-mmt holidays special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("(mmt holidays special)",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace(" - mmt special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("- mmt speciall",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("-mmt special",'')
    data['Hotel Details'] = data['Hotel Details'].str.replace("mmt special",'')
    return data

def convert_dates_to_one_format(data):
    data['Travel Date'] = pd.to_datetime(data['Travel Date'], errors='coerce')
    data['Travel Date'] = data['Travel Date'].dt.strftime('%d-%m-%Y')
    return data

def preprocess_data(data):
    data = remove_useless_columns(data)
    data = convert_dates_to_one_format(data)
    data = hotel_details_encoding(data)
    data = split_columns_with_multiple_values(data)
    data = one_hot_encoding_on_columns(data)
    data = structuring_columns(data)
    return(data)

In [12]:
train = pd.read_csv('dataset\Train.csv')
validation = pd.read_csv('dataset\Test.csv')

In [13]:
final_train_data = preprocess_data(train)
final_validation_data = preprocess_data(validation)

TypeError: expected string or bytes-like object

In [None]:
### Displaying processed data

print('================TRAINING DATA==================')
display(final_train_data)
print(final_train_data.shape)

print('================VALIDATION DATA==================')
display(final_validation_data)
print(final_validation_data.shape)

In [71]:
### Saving processed data to csv

final_train_data.to_csv('dataset\Train_prepared.csv') 
final_validation_data.to_csv('dataset\Test_prepared.csv')