### Import needed packages

In [None]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup

### Load all the necessary data

In [809]:
# Load the country-ISO code dataset
country = pd.read_excel('../original_data/country.xlsx')

# Load the population by country dataset
pop = pd.read_csv("../original_data/Population.csv")

# Load the climate by country dataset
clim = pd.read_csv("../original_data/Climate.csv")

# Load the currencies by country dataset
curr = pd.read_csv("../original_data/Currencies.csv")

# Load the spoken languages by country dataset.
# The closing parenthesis was missing here, which made the whole cell a SyntaxError.
# NOTE(review): the cleaned output shows mojibake (e.g. "Catalã¡N"), which suggests the
# file may actually be UTF-8 rather than Latin-1 - confirm before changing the encoding.
lang = pd.read_csv("../original_data/Languages.csv", encoding='Latin-1')

# Load the religion by country dataset
rel = pd.read_csv("../original_data/Religion.csv")

# Load the peace index dataset
peace = pd.read_csv("../original_data/Peace_Index.csv")

# Load WEF Travel Index data (the sheet has a two-row header)
wef = pd.read_excel('../original_data/WEF_TTDI.xlsx', header=[0, 1], sheet_name='Index Performance')

# Load average restaurant and accommodation cost data
avg_rest_hot_p = pd.read_csv("../original_data/Avg_rest_hot_prices.csv")

# Load LGBTQ dataset (the file name really contains a space before the extension)
lgbtq = pd.read_excel("../original_data/LGBTQ .xlsx")

#### Get Cuisine Rank data by web scraping

In [810]:
# Page with the TasteAtlas cuisine ratings that the following cells scrape
URL = "https://www.tasteatlas.com/best/cuisines?fbclid=IwAR1CFukbqGEObPMECI1SdpO_dOzeMmBjhGXvRlW8GS63JwpqUAi_0QCl4nU"
# The timeout stops the cell from hanging forever on a stalled connection;
# raise_for_status() fails here on an HTTP error instead of parsing an error page later.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [811]:
# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [812]:
# get the block that contains the ranks
results = soup.find(id="BestCuisines")
# find() returns None when nothing matches; stop here with a clear message
# rather than failing with an AttributeError in the next cell.
if results is None:
    raise ValueError('Element with id "BestCuisines" not found; the page layout may have changed.')

In [813]:
# HTML containers holding the country names
countries = results.find_all("div", attrs={"class": "top-container"})
# HTML containers holding the country ratings
ratings = results.find_all("div", attrs={"class": "rating with-title"})

In [814]:
# Pull the link target (the country slug) out of each country container
country_ls = [container.find('a', href=True)['href'] for container in countries]
print(country_ls)

['italy', 'greece', 'spain', 'japan', 'india', 'mexico', 'turkiye', 'USA', 'france', 'peru', 'china', 'brazil', 'portugal', 'poland', 'germany', 'indonesia', 'croatia', 'argentina', 'korea', 'vietnam', 'hungary', 'romania', 'philippines', 'iran', 'serbia', 'georgia', 'czech-republic', 'bulgaria', 'england', 'thailand', 'belgium', 'netherlands', 'austria', 'algeria', 'denmark', 'south-africa', 'syria', 'bih', 'malaysia', 'lebanon', 'ukraine', 'palestine', 'bangladesh', 'lithuania', 'taiwan', 'paraguay', 'pakistan', 'tunisia', 'uruguay', 'slovakia', 'egypt', 'singapore', 'afghanistan', 'ecuador', 'ethiopia', 'belarus', 'haiti', 'russia', 'north-macedonia', 'cuba', 'sri-lanka', 'sweden', 'chile', 'jamaica', 'slovenia', 'bolivia', 'venezuela', 'albania', 'northern-ireland', 'nigeria', 'colombia', 'finland', 'ireland', 'cyprus', 'estonia', 'new-zealand', 'guatemala', 'el-salvador', 'trinidad-and-tobago', 'wales', 'israel', 'azerbaijan', 'honduras', 'costa-rica', 'saudi-arabia', 'malta', 'sw

In [815]:
# Read the <span> text of each rating container and convert it to a number
rating_ls = [float(box.find("span").text) for box in ratings]
print(rating_ls)

[4.72, 4.69, 4.59, 4.59, 4.54, 4.53, 4.52, 4.51, 4.51, 4.51, 4.49, 4.49, 4.47, 4.44, 4.37, 4.37, 4.33, 4.33, 4.31, 4.31, 4.26, 4.25, 4.25, 4.23, 4.23, 4.23, 4.21, 4.2, 4.18, 4.16, 4.14, 4.1, 4.09, 4.07, 4.05, 4.03, 4.03, 3.99, 3.99, 3.99, 3.98, 3.98, 3.97, 3.96, 3.96, 3.96, 3.95, 3.95, 3.95, 3.94, 3.94, 3.94, 3.94, 3.93, 3.93, 3.93, 3.93, 3.92, 3.92, 3.92, 3.92, 3.91, 3.91, 3.91, 3.91, 3.91, 3.91, 3.91, 3.91, 3.91, 3.9, 3.9, 3.9, 3.9, 3.9, 3.89, 3.89, 3.88, 3.88, 3.88, 3.85, 3.85, 3.85, 3.85, 3.85, 3.84, 3.82, 3.82, 3.8, 3.8, 3.8, 3.79, 3.79, 3.69, 3.58]


In [816]:
# Sanity check: every scraped country must come with exactly one rating
len(country_ls) == len(rating_ls)

True

In [817]:
# Combine the two scraped lists into a two-column table
scraped_columns = {'Country': country_ls, 'Ratings': rating_ls}
cuisine_rank = pd.DataFrame(scraped_columns)

## EDA and Data Cleaning, Transforming

#### Country - ISO CODE Dataset

In [915]:
country.head()

Unnamed: 0,Name,Iso3,Continent,SubContinent
0,Aruba,ABW,Latin America and the Caribbean,Caribbean
1,Andorra,ADO,Europe,Southern Europe
2,Afghanistan,AFG,"South, East and South-Eastern Asia",Southern Asia
3,Angola,AGO,Africa,Middle Africa
4,Albania,ALB,Europe,Southern Europe


In [914]:
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          329 non-null    object
 1   Iso3          329 non-null    object
 2   Continent     329 non-null    object
 3   SubContinent  329 non-null    object
dtypes: object(4)
memory usage: 10.4+ KB


In [858]:
# Title-case the country names, then trim surrounding whitespace
country['Name'] = country['Name'].str.title().str.strip()

#### Population dataset

In [917]:
pop.head()

Unnamed: 0,Country,Country Code,Pop_2021
0,Aruba,ABW,107195.0
1,Africa Eastern And Southern,AFE,694665117.0
2,Afghanistan,AFG,39835428.0
3,Africa Western And Central,AFW,470898870.0
4,Angola,AGO,33933611.0


In [918]:
pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264 entries, 0 to 265
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       264 non-null    object 
 1   Country Code  264 non-null    object 
 2   Pop_2021      264 non-null    float64
dtypes: float64(1), object(2)
memory usage: 8.2+ KB


In [919]:
pop.isna().sum()

Country         0
Country Code    0
Pop_2021        0
dtype: int64

In [920]:
pop[pop[["2021"]].isnull().any(axis=1)]

KeyError: "None of [Index(['2021'], dtype='object')] are in the [columns]"

Given the exploration done in the pop dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep only the "Country Name", "Country Code", and the most recent year, 2021 (which we will rename to Pop_2021). Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "Country Name" column. We can also see that there are two null values in 2021, which will be handled.

In [859]:
# Keep the country name, the country code and the 2021 population only
pop = pop.loc[:, ["Country Name", "Country Code", "2021"]]

In [860]:
# Give the kept columns their final names
pop_column_names = {"2021": "Pop_2021", "Country Name": "Country"}
pop = pop.rename(pop_column_names, axis="columns")

In [861]:
# Title-case the Country column, then trim surrounding whitespace
pop["Country"] = pop["Country"].str.title().str.strip()

In [862]:
# Discard every row that has at least one missing value
pop = pop.dropna(axis=0, how="any")

#### Climate dataset

In [None]:
clim.head()

In [None]:
clim.info()

In [None]:
clim.isna().sum()

Given the exploration done in the clim dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep all the columns, but rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "COUNTRY" and "DESCRIPTION" columns. We can also see that there are no null values, hence no handling of missing values is needed in this case.

In [863]:
# Give the columns their final names
clim_column_names = {"COUNTRY": "Country", "DESCRIPTION": "Climate"}
clim = clim.rename(clim_column_names, axis="columns")

In [864]:
# Title-case Country and Climate; only Country is additionally trimmed of surrounding spaces
clim["Country"] = clim["Country"].str.title().str.strip()
clim["Climate"] = clim["Climate"].str.title()

In [865]:
# Derive a coarse climate zone from keywords found in the climate description.
# np.select uses the first rule that matches, so the order of the rules matters;
# descriptions that match no rule get an empty string.
zone_rules = [
    ("Tropical|Arid|Semiarid|Desert", "Tropical"),
    ("Temperate|Continental|Hot Summers And Cold Winters", "Temperate"),
    ("Mediterranean|Subtropical", "Subtropical"),
    ("Highland", "Highland"),
    ("Polar|Antarctic", "Polar"),
    ("Equatorial", "Equatorial"),
]
conditions = [clim["Climate"].str.contains(keywords) for keywords, _ in zone_rules]
choices = [zone for _, zone in zone_rules]
clim["Climate Zone"] = np.select(conditions, choices, default="")

#### Currency dataset

In [None]:
curr.head()

In [None]:
curr.info()

In [None]:
curr.isna().sum()

In [None]:
curr[curr[["AlphabeticCode"]].isnull().any(axis=1)]

In [None]:
curr[curr[["WithdrawalDate"]].notnull().any(axis=1)]

Given the exploration done in the curr dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep only the "Entity", "Currency", and "AlphabeticCode" columns, but rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "Entity" and "Currency" columns. We can also see that there are three null values in the "AlphabeticCode" column, which will be handled. In addition, there are a number of currencies which are no longer used; we will then subset this dataset to only contain currently accepted currencies.

In [866]:
# Currencies still in use are the rows whose withdrawal date is missing
curr = curr[curr["WithdrawalDate"].isna()]

In [867]:
# Keep the entity, the currency name and its alphabetic code only
curr = curr.loc[:, ["Entity", "Currency", "AlphabeticCode"]]

In [868]:
# Give the kept columns their final names
curr_column_names = {"Entity": "Country", "AlphabeticCode": "Currency Code"}
curr = curr.rename(curr_column_names, axis="columns")

In [869]:
# Country: title-case, cut everything from the first "(" onwards, then trim surrounding spaces.
# Currency: title-case only.
curr["Country"] = curr["Country"].str.title().str.partition("(")[0].str.strip()
curr["Currency"] = curr["Currency"].str.title()

In [870]:
# Fill the null values with Non Applicable.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on an intermediate Series and is not guaranteed to
# update curr (pandas warns about this chained assignment).
curr["Currency Code"] = curr["Currency Code"].fillna("Non Applicable")

#### Language dataset

In [None]:
lang.head()

In [None]:
lang.info()

In [None]:
lang.isna().sum()

Given the exploration done in the lang dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep all the columns, but rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "Country" and "Languages Spoken" columns. We can also see that there are no null values, hence no handling of missing values is needed in this case. In addition, we can see that the languages spoken include more than the official languages; in the cleaning we will strive to keep only the official languages.

In [871]:
# The spoken-languages column becomes the official-language column
lang = lang.rename({"Languages Spoken": "Official Language"}, axis="columns")

In [872]:
# Title-case Country and Official Language; only Country is additionally trimmed of surrounding spaces
lang["Country"] = lang["Country"].str.title().str.strip()
lang["Official Language"] = lang["Official Language"].str.title()

In [873]:
def language_cleaning(lan: str):
    """Reduce a title-cased "languages spoken" description to the main language(s).

    Rules:
    - ' And ' is turned into a comma, and digits, '%' and '.' are removed.
    - If the text contains 'Official', keep everything before the first '('.
    - Otherwise keep only the first language, i.e. the text before the first comma.

    Args:
        lan: Language description, expected to be title-cased already (the
            checks look for ' And ' and 'Official' with capital letters).

    Returns:
        The cleaned language string with surrounding whitespace removed.
        Non-string values (e.g. NaN for a missing entry) are returned unchanged.
    """
    # Missing values are not strings; pass them through instead of raising
    if not isinstance(lan, str):
        return lan

    # replace 'and' with a comma
    lan = lan.replace(' And ', ', ')
    # delete numbers, percentage signs and periods; raw string so that \d is
    # not treated as an (invalid) string escape sequence
    lan = re.sub(r"[\d%.]", "", lan)

    if 'Official' in lan:
        # keep the part that comes before the first parenthesis
        head = lan.partition("(")[0]
    else:
        # no official marker: keep the first listed language only
        head = lan.partition(",")[0]

    return head.strip()

In [874]:
# Run every entry of the 'Official Language' column through language_cleaning
lang['Official Language'] = lang['Official Language'].map(language_cleaning)

In [875]:
lang

Unnamed: 0,Country,Official Language
0,Afghanistan,"Dari Persian, Pashtu"
1,Albania,Albanian
2,Algeria,Arabic
3,Andorra,Catalã¡N
4,Angola,Portuguese
...,...,...
193,Vietnam,Vietnamese
194,Western Sahara (Proposed State),Hassaniya Arabic
195,Yemen,Arabic
196,Zambia,English


#### Religion dataset

In [None]:
rel.head()

In [None]:
rel.info()

In [None]:
rel.isna().sum()

In [None]:
rel[rel[["Religion"]].isnull().any(axis=1)]

In [None]:
rel[rel[["Area"]].isnull().any(axis=1)]

In [None]:
rel[rel[["Sex"]].isnull().any(axis=1)]

Given the exploration done in the rel dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep the "Country or Area" and "Religion" columns, but rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "Country or Area" column. We can also see that there are multiple null values, which arise from the footnotes at the end of the document and will be dealt with. In addition, we can see that each country lists all the religions present in it. As we only wish to keep the main one, the others will be removed.

In [876]:
# Remove the footnote rows, i.e. the rows where the Religion column is empty
rel = rel[rel["Religion"].notna()]

In [877]:
# Remove the rows with the total number of religious people, as well as the
# "other", "not stated" and "refused to answer" categories.
# The match is case-insensitive because this cell runs before the labels are
# title-cased, so the raw capitalisation is not guaranteed to match the pattern.
# na=True keeps the previous behaviour of dropping rows with a missing label.
not_a_religion = rel["Religion"].str.contains("Total|Other|Not Stated|Refused to answer", case=False, na=True)
rel = rel[~not_a_religion]

In [878]:
# Keep the data from the most recent year for each country and religion.
# "Value" is the last sort key so that, when several rows share the same country,
# religion and year, keep='last' retains the row with the largest value instead of
# depending on the incoming row order. Missing values are sorted first so that
# they are never the row that is kept when a real value exists.
# NOTE(review): presumably such same-year duplicates come from the per-sex / per-area
# breakdowns of the source table - confirm.
rel = rel.sort_values(by=["Country or Area", "Year", "Value"], na_position="first")
rel = rel.drop_duplicates(subset=['Country or Area', "Religion"], keep='last')

In [879]:
# Per country, the row with the highest value is its main religion:
# order by value and keep the last row of each country
rel = (
    rel.sort_values(["Country or Area", "Value"])
       .drop_duplicates("Country or Area", keep="last")
)

In [880]:
# Keep the country and its religion only
rel = rel.loc[:, ["Country or Area", "Religion"]]

In [881]:
# Give the kept columns their final names
rel_column_names = {"Country or Area": "Country", "Religion": "Main Religion"}
rel = rel.rename(rel_column_names, axis="columns")

In [882]:
# Title-case Country and Main Religion; only Country is additionally trimmed of surrounding spaces
rel["Country"] = rel["Country"].str.title().str.strip()
rel["Main Religion"] = rel["Main Religion"].str.title()

#### Peace index dataset

In [None]:
peace.head()

In [None]:
peace.info()

In [None]:
peace.isna().sum()

In [None]:
peace[peace[["COUNTRY"]].isnull().any(axis=1)]

Given the exploration done in the peace dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep the "RANK" and "COUNTRY" columns, but rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "COUNTRY" column. We can also see that there are multiple null values, which arise from completely empty rows and will be dealt with.

In [883]:
# Keep the rank and the country only
peace = peace.loc[:, ["RANK", "COUNTRY"]]

In [884]:
# Give the kept columns their final names
peace_column_names = {"RANK": "Rank", "COUNTRY": "Country"}
peace = peace.rename(peace_column_names, axis="columns")

In [885]:
# Title-case the Country column, then trim surrounding whitespace
peace["Country"] = peace["Country"].str.title().str.strip()


In [886]:
# Discard every row that has at least one missing value
peace = peace.dropna(axis=0, how="any")

#### WEF Dataset

In [None]:
# to flatten the double header from excel:
# to_flat_index() turns the two-level column MultiIndex into a flat Index of
# (top header, sub header) tuples, so each column can be addressed by one tuple key
wef.columns=wef.columns.to_flat_index()

In [None]:
wef.head()

In [None]:
wef.info()

In [None]:
wef.isna().sum()

Among the 143 columns in the dataset, we will keep the relevant ones and drop all the others. We found no null data in the dataset. Moreover, as the dataset already has an ISO code, no further cleaning is needed for the country name.

In [887]:
# Replace the tuple headers of the five identifier columns with plain names
wef_id_columns = {
    ('Unnamed: 0_level_0', 'ISO Code'): 'ISO Code',
    ('Unnamed: 1_level_0', 'Economy'): 'Country',
    ('Unnamed: 2_level_0', 'Region'): 'Continent',
    ('Unnamed: 3_level_0', 'Sub Region'): 'Sub Continent',
    ('Unnamed: 4_level_0', 'Income Group'): 'Income Group',
}
wef = wef.rename(columns=wef_id_columns)

In [888]:
# List every column next to its position, to pick the positions to keep
for position in range(len(wef.columns)):
    print(position, wef.columns[position])

0 ISO Code
1 Country
2 Continent
3 Sub Continent
4 Income Group
5 ('Travel & Tourism Development Index ', '2019 Value')
6 ('Travel & Tourism Development Index ', '2019 Rank')
7 ('Travel & Tourism Development Index ', '2021 Value')
8 ('Travel & Tourism Development Index ', '2021 Rank')
9 ('Travel & Tourism Development Index ', '% Dif Score')
10 ('Travel & Tourism Development Index ', 'Rank Change')
11 ('Enabling Environment subindex', '2019 Value')
12 ('Enabling Environment subindex', '2019 Rank')
13 ('Enabling Environment subindex', '2021 Value')
14 ('Enabling Environment subindex', '2021 Rank')
15 ('Enabling Environment subindex', '% Dif Score')
16 ('Enabling Environment subindex', 'Rank Change')
17 ('Travel and Tourism Policy and Enabling\nConditions subindex', '2019 Value')
18 ('Travel and Tourism Policy and Enabling\nConditions subindex', '2019 Rank')
19 ('Travel and Tourism Policy and Enabling\nConditions subindex', '2021 Value')
20 ('Travel and Tourism Policy and Enabling\nCondit

In [889]:
# Keep only the columns at the listed positions (the identifier columns plus
# value/rank pairs of the selected indices); every other column is discarded.
# .copy() makes the result an independent DataFrame, so adding the classification
# columns in the following cells does not raise SettingWithCopyWarning.
wef = wef.iloc[:, [0,1,2,3,4,7,8,25,26,49,50,55,56,79,80,85,86,109,110,115,116,127,128,139,140]].copy()
wef.columns

Index([                                           'ISO Code',
                                                   'Country',
                                                 'Continent',
                                             'Sub Continent',
                                              'Income Group',
       ('Travel & Tourism Development Index ', '2021 Value'),
        ('Travel & Tourism Development Index ', '2021 Rank'),
                   ('Infrastructure subindex', '2021 Value'),
                    ('Infrastructure subindex', '2021 Rank'),
                ('Safety and Security pillar', '2021 Value'),
                 ('Safety and Security pillar', '2021 Rank'),
                 ('Health and Hygiene pillar', '2021 Value'),
                  ('Health and Hygiene pillar', '2021 Rank'),
             ('International Openness pillar', '2021 Value'),
              ('International Openness pillar', '2021 Rank'),
              ('Price competitiveness pillar', '2021 Value'),
        

##### Set new columns that classify each index into Very good, Good, Limited

In [890]:
# Infrastructure subindex: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Infrastructure subindex', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Infrastructure subindex, classification"] = np.select(conditions, choices, default="Good")

In [891]:
# Safety and Security pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Safety and Security pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Safety and Security, Classification"] = np.select(conditions, choices, default="Good")

In [892]:
# Health and Hygiene pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Health and Hygiene pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Health and Hygiene, Classification"] = np.select(conditions, choices, default="Good")

In [893]:
# International Openness pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('International Openness pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["International Openess, Classification"] = np.select(conditions, choices, default="Good")

In [894]:
# Price competitiveness pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Price competitiveness pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Price competitiveness, Classification"] = np.select(conditions, choices, default="Good")

In [895]:
# Natural Resources pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Natural Resources pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Natural Resources, Classification"] = np.select(conditions, choices, default="Good")

In [896]:
# Cultural Resources pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Cultural Resources pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Cultural Resources, Classification"] = np.select(conditions, choices, default="Good")

In [897]:
# Environmental Sustainability pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('Environmental Sustainability pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["Environmental Sustainability, Classification"] = np.select(conditions, choices, default="Good")

In [898]:
# T&T Demand Pressure & Impact pillar: rank 40 or better is "Very good", rank 80 or worse is "Limited", anything else "Good"
rank = wef[('T&T Demand Pressure & Impact pillar', '2021 Rank')]
conditions = [rank <= 40, rank >= 80]
choices = ["Very good", "Limited"]
wef["T&T Demand Pressure & Impact, Classification"] = np.select(conditions, choices, default="Good")

#### Average Restaurant and Accommodation Cost Dataset

In [None]:
avg_rest_hot_p.head()

In [None]:
avg_rest_hot_p.info()

In [None]:
avg_rest_hot_p.isna().sum()

In [None]:
avg_rest_hot_p[avg_rest_hot_p[["Country Name"]].isnull().any(axis=1)]

In [None]:
avg_rest_hot_p[avg_rest_hot_p[["Country Code"]].isnull().any(axis=1)]

Given the exploration done in the avg_rest_hot_p dataset, we can conclude that for the next step in the cleaning and transformations, we wish to keep only the "Country Name", "Country Code", and "2017 [YR2017]" columns, but we will rename them. Moreover, to make sure that the dataset is homogeneous, we will capitalize the first letter of each word in the "Country Name" column. We can also see that there are 5 null values, which arise from empty rows and footnotes; these will be handled accordingly. Additionally, we can see that some countries have ".." as the average price instead of a null value, which will be solved.

In [899]:
# Keep the country name, the country code and the 2017 figure only
avg_rest_hot_p = avg_rest_hot_p.loc[:, ["Country Name", "Country Code", "2017 [YR2017]"]]

In [900]:
# Give the kept columns their final names
avg_column_names = {"Country Name": "Country", "2017 [YR2017]": "Average Hotel and Restaurant Cost"}
avg_rest_hot_p = avg_rest_hot_p.rename(avg_column_names, axis="columns")

In [901]:
# Title-case the Country column, then trim surrounding whitespace
avg_rest_hot_p["Country"] = avg_rest_hot_p["Country"].str.title().str.strip()

In [902]:
# Blank out the ".." placeholders: they become empty strings here and only turn
# into missing values when the column is converted to numeric further down
avg_rest_hot_p["Average Hotel and Restaurant Cost"] = avg_rest_hot_p["Average Hotel and Restaurant Cost"].replace({"..": ""})

In [903]:
# Discard every row that has at least one missing value
avg_rest_hot_p = avg_rest_hot_p.dropna(axis=0, how="any")

In [904]:
# Remove empty rows
# (repeats the previous cell; rows with missing values were already dropped there,
# so this second call changes nothing)
avg_rest_hot_p = avg_rest_hot_p.dropna()

In [905]:
# Parse the cost column as numbers; text that cannot be parsed becomes NaN
cost_as_text = avg_rest_hot_p["Average Hotel and Restaurant Cost"]
avg_rest_hot_p["Average Hotel and Restaurant Cost"] = pd.to_numeric(cost_as_text, errors="coerce")

In [906]:
# Keep two decimal places
avg_rest_hot_p["Average Hotel and Restaurant Cost"] = avg_rest_hot_p["Average Hotel and Restaurant Cost"].round(decimals=2)

In [907]:
# Classify each country by cost: up to 70 is budget, above 70 and up to 150 is comfort,
# above 150 is luxury; rows matching none of these (missing cost) get "Non Applicable"
cost = avg_rest_hot_p["Average Hotel and Restaurant Cost"]
conditions = [
    cost <= 70,
    (cost > 70) & (cost <= 150),
    cost > 150,
]
choices = ["Budget Traveler", "Comfort traveler", "Luxury Travel"]
avg_rest_hot_p["Type Traveler"] = np.select(conditions, choices, default="Non Applicable")

In [None]:
lgbtq.head()

In [None]:
lgbtq.info()

#### LGBTQ Dataset

In [908]:
# Capitalise the first letter of each word in the COUNTRY column and remove any
# surrounding spaces, as done for the other datasets, so that stray whitespace
# cannot prevent a match in the later name-based merge with the country table.
lgbtq['COUNTRY'] = lgbtq['COUNTRY'].str.title().str.strip()

In [911]:
cuisine_rank.head()

Unnamed: 0,Country,Ratings
0,italy,4.72
1,greece,4.69
2,spain,4.59
3,japan,4.59
4,india,4.54


In [912]:
cuisine_rank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  95 non-null     object 
 1   Ratings  95 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.6+ KB


#### Cuisine Rank Dataset

In [913]:
# Turn the scraped URL slugs into country names: hyphens become spaces and the
# first letter of each word is capitalised.
# regex=False states explicitly that '-' is a literal string, so the result does not
# depend on the pandas version's default for the regex argument.
cuisine_rank['Country'] = cuisine_rank['Country'].str.replace('-', ' ', regex=False).str.title()

Unnamed: 0,Country,Ratings
0,italy,4.72
1,greece,4.69
2,spain,4.59
3,japan,4.59
4,india,4.54
...,...,...
90,iceland,3.80
91,canada,3.79
92,latvia,3.79
93,morocco,3.69


In [910]:
cuisine_rank

SyntaxError: invalid syntax (861960806.py, line 2)

#### Airport distance - country

#### Check climate - country

In [None]:
# Look up each climate row's country name in the country table to attach its ISO3 code
clim = pd.merge(clim, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
clim.head()

In [None]:
# check the countries that are not merged
clim[clim['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper
# 'Name' column, which merely repeats 'Country'
clim = clim.dropna(subset=['Name']).drop(columns=['Name'])

In [None]:
clim.head()

#### Check Currency - Country

In [None]:
# Look up each currency row's country name in the country table to attach its ISO3 code
curr = pd.merge(curr, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
curr.head()

In [None]:
curr[curr['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
curr = curr.dropna(subset=['Name']).drop(columns=['Name'])

#### Check language - country

In [None]:
# Look up each language row's country name in the country table to attach its ISO3 code
lang = pd.merge(lang, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
lang.head()

In [None]:
lang[lang['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
lang = lang.dropna(subset=['Name']).drop(columns=['Name'])

#### Check peace - country

In [None]:
# Look up each peace-index row's country name in the country table to attach its ISO3 code
peace = pd.merge(peace, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
peace.head()

In [None]:
peace[peace['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
peace = peace.dropna(subset=['Name']).drop(columns=['Name'])

#### Check population - country

In [None]:
# Look up each population row's country name in the country table
pop = pd.merge(pop, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
pop.head()

In [None]:
pop[pop['Name'].isnull()]

In [None]:
# Keep only the rows whose country name was found in the country table
pop = pop[pop['Name'].notna()]

In [None]:
# The population data carries its own country code, so both columns
# brought in by the merge can be discarded
pop = pop.drop(columns=['Name', 'Iso3'])

#### Check Religion - country

In [None]:
# Look up each religion row's country name in the country table to attach its ISO3 code
rel = pd.merge(rel, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
rel.head()

In [None]:
rel[rel['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
rel = rel.dropna(subset=['Name']).drop(columns=['Name'])

#### Check Average Restaurant and Accommodation Cost - ISO

In [None]:
# This dataset has its own country code, so it is matched to the country table on the code
avg_rest_hot_p = pd.merge(avg_rest_hot_p, country[['Name', 'Iso3']], how='left', left_on='Country Code', right_on='Iso3')
avg_rest_hot_p.head()

In [None]:
avg_rest_hot_p [avg_rest_hot_p ['Name'].isnull()]

In [None]:
# Keep only the rows matched in the country table; the dataset already holds a
# country name and code of its own, so both merged columns are discarded
avg_rest_hot_p = avg_rest_hot_p.dropna(subset=['Name']).drop(columns=['Name', 'Iso3'])

#### Cuisine - Country

In [None]:
# Trial merge into a temporary table, used to see which cuisine countries have no match
cuisine_temp = pd.merge(cuisine_rank, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
cuisine_temp.head()

In [None]:
cuisine_temp[cuisine_temp['Name'].isnull()]

As the United Kingdom is split into England, Northern Ireland, Wales and Scotland,
we will add one row for the United Kingdom with the average rating of the 4 nations,
as the WEF dataset lists the United Kingdom as a single country.

In [None]:
# Add one United Kingdom row holding the average rating of its four nations.
# The nations are looked up by name instead of by hard-coded row positions,
# because the positions follow the live ranking of the scraped page and shift
# whenever that ranking changes.
uk_nations = ["england", "northern ireland", "wales", "scotland"]
# Compare on a normalised key so the lookup works on slugs as well as cleaned names
country_key = cuisine_rank["Country"].str.replace("-", " ", regex=False).str.lower()
uk_rank = cuisine_rank.loc[country_key.isin(uk_nations), "Ratings"].mean()
# Guard so that re-running the cell does not append a second United Kingdom row
if not country_key.eq("united kingdom").any():
    cuisine_rank.loc[len(cuisine_rank.index)] = ['United Kingdom', uk_rank]

In [None]:
# Order from the best to the worst rating and renumber the rows from zero
cuisine_rank = cuisine_rank.sort_values(by=['Ratings'], ascending=False, ignore_index=True)

In [None]:
# Merge the table that now includes the United Kingdom row with the country table
cuisine_rank = pd.merge(cuisine_rank, country[['Name', 'Iso3']], how='left', left_on='Country', right_on='Name')
# Show the United Kingdom row to confirm that it was matched
cuisine_rank.loc[cuisine_rank['Country'] == 'United Kingdom']

In [None]:
cuisine_rank[cuisine_rank['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
cuisine_rank = cuisine_rank.dropna(subset=['Name']).drop(columns=['Name'])

#### LGBT data - Country

In [None]:
# Look up each LGBTQ row's country name in the country table to attach its ISO3 code
lgbtq = pd.merge(lgbtq, country[['Name', 'Iso3']], how='left', left_on='COUNTRY', right_on='Name')
lgbtq.head()

In [None]:
lgbtq[lgbtq['Name'].isnull()]

In [None]:
# Keep only the countries found in the country table, then discard the helper 'Name' column
lgbtq = lgbtq.dropna(subset=['Name']).drop(columns=['Name'])

In [None]:
# Write every cleaned table to the cleaned_data folder, one CSV file per dataset
cleaned_outputs = {
    '../cleaned_data/country.csv': country,
    '../cleaned_data/population.csv': pop,
    '../cleaned_data/climate.csv': clim,
    '../cleaned_data/currencies.csv': curr,
    '../cleaned_data/language.csv': lang,
    '../cleaned_data/religion.csv': rel,
    '../cleaned_data/peace_index.csv': peace,
    '../cleaned_data/wef_ttdi.csv': wef,
    "../cleaned_data/avg_rest_hot_p.csv": avg_rest_hot_p,
    "../cleaned_data/lgbtq.csv": lgbtq,
    '../cleaned_data/cuisine_rank.csv': cuisine_rank,
}
for csv_path, frame in cleaned_outputs.items():
    frame.to_csv(csv_path)
