In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split

In [23]:
data = pd.read_csv(r"touristData.csv")
data.head()

Unnamed: 0,Gender,Married,Age Category,Ethnicity,Degree,Choose option that best describe your career path,Choose your preferred tourism category
0,Female,No,29 - 35,Black or African American,Bachelor,"Arts, Design, Entertainment, Sports, and Media",Historical
1,Male,No,23 - 28,Black or African American,Master,"Life, Physical and Social Science",Tourist
2,Male,Yes,23 - 28,Black or African American,Diploma,Computer and Mathematical,Tourist
3,Male,No,23 - 28,Middle Eastern,Bachelor,"Life, Physical and Social Science",Historical
4,Female,Yes,23 - 28,Native Hawaiin,Diploma,Building and Grounds Cleaning and Maintenance,Historical and Islamic


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 7 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   Gender                                             103 non-null    object
 1   Married                                            103 non-null    object
 2   Age Category                                       103 non-null    object
 3   Ethnicity                                          103 non-null    object
 4   Degree                                             103 non-null    object
 5   Choose option that best describe your career path  99 non-null     object
 6   Choose your preferred tourism category             103 non-null    object
dtypes: object(7)
memory usage: 5.8+ KB


In [25]:
data['career path'] = data['Choose option that best describe your career path']

In [26]:
data['category'] = data['Choose your preferred tourism category']

In [27]:
data.drop(['Choose option that best describe your career path','Choose your preferred tourism category'], axis =1, inplace =True)

In [28]:
data['career path'].fillna('Arts, Design, Entertainment, Sports, and Media', inplace=True)
data['Ethnicity'].unique()

array(['Black  or  African American', 'Middle Eastern', 'Native Hawaiin',
       'Hispanic or Latino', 'White', 'American Indian or  Alaska Native',
       'Ethnicity'], dtype=object)

In [29]:
data['Ethnicity'].replace('Black  or  African American', 'Black or African American', inplace = True)
data['Ethnicity'].replace('American Indian or  Alaska Native', 'American Indian or Alaska Native', inplace =True)
data['career path'].replace('Hospitality', 'Food Preparation and Serving Related', inplace =True)

In [30]:
data['career path'].unique()

array(['Arts, Design, Entertainment, Sports, and Media',
       'Life, Physical and Social Science', 'Computer and Mathematical',
       'Building and Grounds Cleaning and Maintenance',
       'Architecture and Engineering',
       'Food Preparation and Serving Related',
       'Farming, Fishing, and Forestry',
       'Educational Instruction and Library',
       'Healthcare Practitioners and Technical', 'Sales and Related',
       'Legal', 'Choose option that best describe your career path'],
      dtype=object)

In [31]:
def catid(x):
    if x == 'Historical':
        return 1
    elif x == 'category':
        return 0
    elif x == 'Tourist':
        return 0
    elif x == 'Islamic':
        return 2
    elif x == 'Historical and Islamic':
        return 3
    elif x == 'Choose your preferred tourism category':
        return 0

def gender(x):
    if x == 'Male':
        return 0
    elif x == 'Female':
        return 1
    elif x == 'Unspecified':
        return 2
    elif x == 'Gender':
        return 3
    
def age(x):
    if x == '5 - 10':
        return 0
    elif x == '11 - 17':
        return 1
    elif x == '18 - 22':
        return 2
    elif x == '23 - 28':
        return 3
    elif x == '29 - 35':
        return 4
    elif x == '36 - 45':
        return 5
    elif x == '49 - 55':
        return 6
    elif x == '55 - older':
        return 7
    elif x == 'Age Category':
        return 0

data['categoryid'] = data['category'].apply(catid)

In [32]:
def ethnic(x):
    if x == 'American Indian or Alaska Native':
        return 0
    elif x == 'Asian':
        return 1
    elif x == 'Black or African American':
        return 2
    elif x == 'Hispanic or Latino':
        return 3
    elif x == 'Native Hawaiin':
        return 4
    elif x == 'White':
        return 5
    elif x == 'Middle Eastern':
        return 6
    elif x == 'Ethnicity':
        return 0

def degree(x):
    
    if x == 'Uneducated':
        return 0
    elif x == 'Secondary':
        return 1
    elif x == 'Diploma':
        return 2
    elif x == 'Bachelor':
        return 3
    elif x == 'Master':
        return 4
    elif x == 'Doctorate':
        return 5
    elif x == 'Degree':
        return 0
    
def status(x):
    if x == 'No':
        return 0
    elif x == 'Yes':
        return 1
    elif x == 'Married':
        return 0

In [33]:
def job(x):
    if x == 'Business and Financial Occupations':
        return 0
    elif x == 'Computer and Mathematical':
        return 1
    elif x == 'Architecture and Engineering':
        return 2
    elif x == 'Life, Physical and Social Science':
        return 3
    elif x == 'Community and Social Service':
        return 4
    elif x == 'Legal':
        return 5
    elif x == 'Educational Instruction and Library':
        return 6
    elif x == 'Arts, Design, Entertainment, Sports, and Media':
        return 7
    elif x == 'Healthcare Practitioners and Technical':
        return 8
    elif x == 'Food Preparation and Serving Related':
        return 9
    elif x == 'Building and Grounds Cleaning and Maintenance':
        return 10
    elif x == 'Personal Care and Service':
        return 11
    elif x == 'Sales and Related':
        return 12
    elif x == 'Office and Administrative Support':
        return 13
    elif x == 'Farming, Fishing, and Forestry':
        return 14
    elif x == 'Construction and Extraction':
        return 15
    elif x == 'Installation, Maintenance, and Repair':
        return 16
    elif x == 'Transportation and Material Moving':
        return 17
    elif x == 'Choose option that best describe your career path':
        return 0

In [34]:
data['Age'] = data['Age Category'].apply(age)
data['Sex'] = data['Gender'].apply(gender)
data['Ethnic'] = data['Ethnicity'].apply(ethnic)
data['Qualification'] = data['Degree'].apply(degree)
data['career'] = data['career path'].apply(job)
data['Status'] = data['Married'].apply(status)

In [35]:
data.drop(['Age Category', 'Gender', 'Ethnicity', 'Degree', 'career path', 'category','Married'], axis=1, inplace =True)

In [36]:
data.head(2)

Unnamed: 0,categoryid,Age,Sex,Ethnic,Qualification,career,Status
0,1,4,1,2,3,7,0
1,0,3,0,2,4,3,0


In [37]:
y  = data['categoryid']

In [38]:
x = data.drop(['categoryid'], axis = 1)
x.head(2)

Unnamed: 0,Age,Sex,Ethnic,Qualification,career,Status
0,4,1,2,3,7,0
1,3,0,2,4,3,0


In [39]:
saved = x.to_csv('test.csv')

In [40]:
#X = pd.get_dummies(x, columns = ['Age','Sex', 'Ethnic', 'Qualification', 'Status'])

In [41]:
#from sklearn.preprocessing import MinMaxScaler
#sc = MinMaxScaler()
#Xsc = sc.fit_transform(x)
#x.columns

In [42]:
# = pd.DataFrame(Xsc, columns=['Age', 'Sex', 'Ethnic', 'Qualification', 'career', 'Status'])

In [43]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 1/4, random_state = 2)

In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [45]:
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
classifier = []
classifier.append(("LogisticReg", LogisticRegression(solver='liblinear',multi_class='ovr')))
classifier.append(("DecisionTree", DecisionTreeClassifier(criterion ='entropy')))
classifier.append(("KNN", KNeighborsClassifier()))
classifier.append(("KernelSVM", SVC(gamma='auto')))
classifier.append(("NaiveBayes", GaussianNB()))
classifier.append(("RandomForest", RandomForestClassifier()))

In [48]:
seed = 0
results = []
names = []
for name, model in classifier:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, xtrain, ytrain,cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LogisticReg: 0.576786 (0.212200)
DecisionTree: 0.973214 (0.053720)
KNN: 0.842857 (0.128968)
KernelSVM: 0.973214 (0.053720)
NaiveBayes: 0.596429 (0.122735)
RandomForest: 0.973214 (0.053720)


In [54]:
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)

DecisionTreeClassifier()

In [55]:
ypred = model.predict(xtest)
ypred

array([2, 0, 1, 0, 3, 2, 0, 3, 2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2,
       1, 1, 2, 2], dtype=int64)

In [56]:
print("R2 score : %.2f" % r2_score(ytest,ypred))

R2 score : 0.84


In [57]:
print("Mean squared error: %.2f" % mean_squared_error(ytest,ypred))

Mean squared error: 0.12


In [58]:
filename = 'tourist_model1.sav'
pickle.dump(model, open(filename, 'wb'))

In [1]:
import sklearn

In [6]:
sklearn.__version__

'0.21.2'