In [1]:
import pickle

import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle

In [16]:
# Load 'train.csv' (path relative to the working directory) into a DataFrame.
all_data = pd.read_csv('train.csv')

In [19]:
# Print a summary of the frame: index range, column names and dtypes.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37670293 entries, 0 to 37670292
Data columns (total 26 columns):
date_time                    int64
site_name                    int64
posa_continent               int64
user_location_country        int64
user_location_region         int64
user_location_city           int64
orig_destination_distance    float64
user_id                      int64
is_mobile                    int64
is_package                   int64
channel                      int64
srch_ci                      object
srch_co                      object
srch_adults_cnt              int64
srch_children_cnt            int64
srch_rm_cnt                  int64
srch_destination_id          int64
srch_destination_type_id     int64
is_booking                   int64
cnt                          int64
hotel_continent              int64
hotel_country                int64
hotel_market                 int64
hotel_cluster                int64
year                         int64
month 

In [17]:
# Parse the "date_time" column once and derive the calendar parts from it.
# NOTE: "date_time" itself ends up holding only the day of the month, so
# re-running this cell without reloading `all_data` would parse those day
# numbers as timestamps instead of the original values.
event_time = pd.to_datetime(all_data["date_time"])
for column, part in (("year", "year"), ("month", "month"), ("date_time", "day")):
    all_data[column] = getattr(event_time.dt, part)

In [21]:
# Convert "srch_ci" to datetimes; values that do not match YYYY-MM-DD are
# coerced to NaT. The year/month/day parts are appended as new columns.
srch_ci_parsed = pd.to_datetime(all_data["srch_ci"], format='%Y-%m-%d', errors="coerce")
all_data["srch_ci"] = srch_ci_parsed
for part in ("year", "month", "day"):
    all_data["srch_ci_" + part] = getattr(srch_ci_parsed.dt, part)

In [22]:
# Convert "srch_co" to datetimes; values that do not match YYYY-MM-DD are
# coerced to NaT. The year/month/day parts are appended as new columns.
srch_co_parsed = pd.to_datetime(all_data["srch_co"], format='%Y-%m-%d', errors="coerce")
all_data["srch_co"] = srch_co_parsed
for part in ("year", "month", "day"):
    all_data["srch_co_" + part] = getattr(srch_co_parsed.dt, part)

In [23]:
# Keep only the rows that have no missing value in any column (NaN and NaT
# both count as missing). A single dropna() gives the same rows as filtering
# column by column, without re-copying the frame once per column.
all_data = all_data.dropna()

In [None]:
# Split every hotel_cluster 70/30 into train/test so that both parts keep the
# same cluster proportions as the full frame.
TRAIN_FRACTION = 0.7
SPLIT_SEED = 42  # fixed seed so a kernel restart reproduces the same split

hotel_id_set = set(all_data['hotel_cluster'])
train_parts = []
test_parts = []
for hotel_id in hotel_id_set:
    cluster_rows = shuffle(
        all_data[all_data['hotel_cluster'] == hotel_id],
        random_state=SPLIT_SEED,
    )
    train_rows = int(len(cluster_rows) * TRAIN_FRACTION)
    train_parts.append(cluster_rows[:train_rows])
    test_parts.append(cluster_rows[train_rows:])

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# re-copies every previously collected row on each iteration.
train = pd.concat(train_parts) if train_parts else None
test = pd.concat(test_parts) if test_parts else None

In [None]:
# Show the sizes of both split parts next to the full frame, then persist
# the split without the index column.
for frame in (train, test, all_data):
    print(frame.shape)
for frame, csv_path in ((train, 'train_naive.csv'), (test, 'test_naive.csv')):
    frame.to_csv(csv_path, index=False)
print("csv files written to train_naive.csv, test_naive.csv'")

In [None]:
# Reload the persisted split and report its dimensions.
train, test = read_train_test_csv()
print("train shape: \n", train.shape)
print("test shape: \n", test.shape)

# Fit and score each model in a fixed order; every entry of `performances`
# is [label, (test_accuracy, train_accuracy)].
model_runs = (
    ("Naive Bayes", classify_gaussian_nb),
    ("Random Forest", classify_random_forest),
    ("Logistic Regression", classify_logistic_regression),
    ("Gradient Boosting", classify_gradient_boosting_classifier),
)
performances = [[label, run_model(train, test)] for label, run_model in model_runs]
for val in performances:
    print(val)

In [14]:
def read_input_split_data(all_data, random_state=None):
    """
    Read 'train.csv', derive date features, drop incomplete rows and write a
    per-cluster 70/30 train/test split to 'train_naive.csv' / 'test_naive.csv'.

    :param all_data: ignored. The data is always re-read from 'train.csv';
        the parameter is kept only so existing callers keep working.
    :param random_state: optional seed forwarded to sklearn.utils.shuffle;
        pass an int for a reproducible split. The default (None) keeps the
        previous unseeded behaviour.
    :return: None
    :raises ValueError: if no rows are left after dropping incomplete rows
    """
    print('reading data...')
    frame = pd.read_csv('train.csv')
    print('read')

    # "date_time" is replaced by the day of the month; year and month are
    # appended as new columns.
    event_time = pd.to_datetime(frame["date_time"])
    frame["year"] = event_time.dt.year
    frame["month"] = event_time.dt.month
    frame["date_time"] = event_time.dt.day
    print('Reading Done date')

    # Values that do not match YYYY-MM-DD are coerced to NaT here and removed
    # by the dropna() below.
    frame["srch_ci"] = pd.to_datetime(frame["srch_ci"], format='%Y-%m-%d', errors="coerce")
    frame["srch_ci_year"] = frame["srch_ci"].dt.year
    frame["srch_ci_month"] = frame["srch_ci"].dt.month
    frame["srch_ci_day"] = frame["srch_ci"].dt.day
    print('Reading Done srch_ci')

    frame["srch_co"] = pd.to_datetime(frame["srch_co"], format='%Y-%m-%d', errors="coerce")
    frame["srch_co_year"] = frame["srch_co"].dt.year
    frame["srch_co_month"] = frame["srch_co"].dt.month
    frame["srch_co_day"] = frame["srch_co"].dt.day

    print('Reading Done All')

    # One pass instead of re-filtering (and copying) the frame once per column.
    frame = frame.dropna()
    if frame.empty:
        raise ValueError("no complete rows left in 'train.csv' after dropping missing values")

    # Split each hotel_cluster 70/30 so both parts keep the cluster proportions.
    train_parts = []
    test_parts = []
    for hotel_id in set(frame['hotel_cluster']):
        cluster_rows = shuffle(frame[frame['hotel_cluster'] == hotel_id], random_state=random_state)
        train_rows = int(len(cluster_rows) * 0.7)
        train_parts.append(cluster_rows[:train_rows])
        test_parts.append(cluster_rows[train_rows:])

    # Concatenate once; pd.concat inside the loop re-copies all earlier rows
    # on every iteration.
    train = pd.concat(train_parts)
    test = pd.concat(test_parts)

    print(train.shape)
    print(test.shape)
    print(frame.shape)
    train.to_csv('train_naive.csv', index=False)
    test.to_csv('test_naive.csv', index=False)
    print("csv files written to train_naive.csv, test_naive.csv'")

In [3]:
def read_train_test_csv():
    """
    Load 'train_naive.csv' and 'test_naive.csv' from the working directory.

    :return: (train, test) tuple of DataFrames, in that order
    """
    train_frame = pd.read_csv("train_naive.csv")
    test_frame = pd.read_csv("test_naive.csv")
    return train_frame, test_frame

In [5]:
def classify_random_forest(train, test):
    """
    Fit a 200-tree random forest, pickle it to "RandomForest.dat" and score it.

    Columns are selected by position: the first 30 columns are the features,
    and everything from position 30 onward is flattened into the label vector.
    NOTE(review): this assumes the label is the only column at position 30+;
    confirm against the layout of the CSV files that are passed in.

    :param train: train data
    :param test: test data
    :return: test_accuracy, train_accuracy
    """
    print("classify_random_forest", end="")
    print(" Model Fitting....")

    # DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
    train_features = train.iloc[:, 0:30]
    train_labels = train.iloc[:, 30:].values.ravel()
    test_features = test.iloc[:, 0:30]
    test_labels = test.iloc[:, 30:].values.ravel()

    classifier = RandomForestClassifier(n_estimators=200)
    classifier.fit(train_features, train_labels)

    # Context manager so the model file is flushed and closed deterministically.
    with open("RandomForest.dat", "wb") as model_file:
        pickle.dump(classifier, model_file)

    accuracy = metrics.accuracy_score(test_labels, classifier.predict(test_features), normalize=True)
    print("Tested on testing data using model and accuracy: ", accuracy)
    train_accuracy = metrics.accuracy_score(train_labels, classifier.predict(train_features), normalize=True)
    print("Tested on Training data using model and accuracy: ", train_accuracy)
    return accuracy, train_accuracy

In [6]:
def classify_gaussian_nb(train, test):
    """
    Fit a Gaussian naive Bayes model, pickle it to "GaussianNB.dat" and score it.

    Columns are selected by position: the first 30 columns are the features,
    and everything from position 30 onward is flattened into the label vector.
    NOTE(review): this assumes the label is the only column at position 30+;
    confirm against the layout of the CSV files that are passed in.

    :param train: train data
    :param test: test data
    :return: test_accuracy, train_accuracy
    """
    print("classify_gaussian_nb", end="")
    print(" Model Fitting....")

    # DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
    train_features = train.iloc[:, 0:30]
    train_labels = train.iloc[:, 30:].values.ravel()
    test_features = test.iloc[:, 0:30]
    test_labels = test.iloc[:, 30:].values.ravel()

    classifier = GaussianNB()
    classifier.fit(train_features, train_labels)

    # Context manager so the model file is flushed and closed deterministically.
    with open("GaussianNB.dat", "wb") as model_file:
        pickle.dump(classifier, model_file)

    accuracy = metrics.accuracy_score(test_labels, classifier.predict(test_features), normalize=True)
    print("Tested on testing data using model and accuracy: ", accuracy)
    train_accuracy = metrics.accuracy_score(train_labels, classifier.predict(train_features), normalize=True)
    print("Tested on Training data using model and accuracy: ", train_accuracy)
    return accuracy, train_accuracy

In [7]:
def classify_gradient_boosting_classifier(train, test):
    """
    Fit a gradient boosting classifier (default settings), pickle it to
    "gradient.dat" and score it.

    Columns are selected by position: the first 30 columns are the features,
    and everything from position 30 onward is flattened into the label vector.
    NOTE(review): this assumes the label is the only column at position 30+;
    confirm against the layout of the CSV files that are passed in.

    :param train: train data
    :param test: test data
    :return: test_accuracy, train_accuracy
    """
    print("classify_gradient_boosting_classifier", end="")
    print(" Model Fitting....")

    # DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
    train_features = train.iloc[:, 0:30]
    train_labels = train.iloc[:, 30:].values.ravel()
    test_features = test.iloc[:, 0:30]
    test_labels = test.iloc[:, 30:].values.ravel()

    classifier = GradientBoostingClassifier()
    classifier.fit(train_features, train_labels)

    # Context manager so the model file is flushed and closed deterministically.
    with open("gradient.dat", "wb") as model_file:
        pickle.dump(classifier, model_file)

    accuracy = metrics.accuracy_score(test_labels, classifier.predict(test_features), normalize=True)
    print("Tested on testing data using model and accuracy: ", accuracy)
    train_accuracy = metrics.accuracy_score(train_labels, classifier.predict(train_features), normalize=True)
    print("Tested on Training data using model and accuracy: ", train_accuracy)
    return accuracy, train_accuracy

In [4]:
def classify_logistic_regression(train, test):
    """
    Fit a one-vs-rest logistic regression (C=100), pickle it to "Logistic.dat"
    and score it.

    Columns are selected by position: the first 30 columns are the features,
    and everything from position 30 onward is flattened into the label vector.
    NOTE(review): this assumes the label is the only column at position 30+;
    confirm against the layout of the CSV files that are passed in.

    :param train: train data
    :param test: test data
    :return: test_accuracy, train_accuracy
    """
    print("classify_logistic_regression", end="")
    print(" Model Fitting....")

    # DataFrame.ix was removed in pandas 1.0; .iloc is the positional equivalent.
    train_features = train.iloc[:, 0:30]
    train_labels = train.iloc[:, 30:].values.ravel()
    test_features = test.iloc[:, 0:30]
    test_labels = test.iloc[:, 30:].values.ravel()

    classifier = LogisticRegression(multi_class='ovr', C=100)
    classifier.fit(train_features, train_labels)

    # Context manager so the model file is flushed and closed deterministically.
    with open("Logistic.dat", "wb") as model_file:
        pickle.dump(classifier, model_file)

    accuracy = metrics.accuracy_score(test_labels, classifier.predict(test_features), normalize=True)
    print("Tested on testing data using model and accuracy: ", accuracy)
    train_accuracy = metrics.accuracy_score(train_labels, classifier.predict(train_features), normalize=True)
    print("Tested on Training data using model and accuracy: ", train_accuracy)
    return accuracy, train_accuracy

In [10]:
def main():
    """
    Run the whole pipeline: build the train/test CSV files, load them back,
    then fit and score every classifier and print the collected accuracies.

    :return: None
    """
    read_input_split_data(all_data)
    train, test = read_train_test_csv()
    print("train shape: \n", train.shape)
    print("test shape: \n", test.shape)

    # Models run in this fixed order; each result row is
    # [label, (test_accuracy, train_accuracy)].
    model_runs = (
        ("Naive Bayes", classify_gaussian_nb),
        ("Random Forest", classify_random_forest),
        ("Logistic Regression", classify_logistic_regression),
        ("Gradient Boosting", classify_gradient_boosting_classifier),
    )
    performances = [[label, run_model(train, test)] for label, run_model in model_runs]
    for row in performances:
        print(row)

In [15]:
# Run the full pipeline only when this code is executed as the top-level
# module, not when it is imported.
if __name__ == '__main__':
    main()

reading data...


OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 2557-08-15 00:00:00