## Importing packages

In [None]:
import pandas as pd

## Importing data and creating DataFrames

In [None]:
# Place-level tables (each is later sorted by 'placeID'): read every CSV and
# unpack the resulting DataFrames, in the same order as the file list.
paymentModeData, cuisineData, hoursData, parkingData, geoPlaceData = map(
    pd.read_csv,
    [
        'chefmozaccepts.csv',  # -> paymentModeData
        'chefmozcuisine.csv',  # -> cuisineData
        'chefmozhours4.csv',   # -> hoursData
        'chefmozparking.csv',  # -> parkingData
        'geoplaces2.csv',      # -> geoPlaceData
    ],
)

In [None]:
# User-level tables. The cuisine table's columns are relabelled positionally so
# its key is called 'user_ID', while the remaining user tables keep the column
# names found in their files.
userCuisineData = pd.read_csv('usercuisine.csv')
userCuisineData.columns = ['user_ID', 'Rcuisine']

userPaymentData, userProfileData, userRatingData = (
    pd.read_csv(csv_name)
    for csv_name in ('userpayment.csv', 'userprofile.csv', 'rating_final.csv')
)

In [None]:
# Per-subject score tables; each one uses its 'Student' column as the row index.
englishScoreData, geographyScoreData, mathsScoreData = [
    pd.read_csv(marks_csv, index_col='Student')
    for marks_csv in (
        'merge_concat_join/english_marks.csv',
        'merge_concat_join/geography_marks.csv',
        'merge_concat_join/maths_marks.csv',
    )
]

## Sorting datasets

In [None]:
# Order every place-level table by its 'placeID' key.
# All five tables are rebound to the sorted result (no `inplace=True`), so the
# cell uses one consistent style and can be re-run safely.
# NOTE: the merge_asof call near the end of the notebook relies on cuisineData
# being sorted by 'placeID'.
paymentModeData = paymentModeData.sort_values('placeID')
cuisineData = cuisineData.sort_values('placeID')
hoursData = hoursData.sort_values('placeID')
parkingData = parkingData.sort_values('placeID')
geoPlaceData = geoPlaceData.sort_values('placeID')

In [None]:
# Order every user-level table by its user key. The cuisine table's key is
# 'user_ID' (renamed on load); the other tables use 'userID'.
# Each name is rebound to the sorted frame instead of sorting in place, which
# keeps the cell chain-friendly and avoids mutating frames behind the reader.
userCuisineData = userCuisineData.sort_values('user_ID')
userPaymentData = userPaymentData.sort_values('userID')
userProfileData = userProfileData.sort_values('userID')
userRatingData = userRatingData.sort_values('userID')

In [None]:
# Preview the first row of every place-level table, printing a dashed divider
# between consecutive previews (none after the last one).
place_tables = (paymentModeData, cuisineData, hoursData, parkingData, geoPlaceData)
for table_position, place_table in enumerate(place_tables):
    if table_position > 0:
        print('------------')
    print(place_table.head(1))

In [None]:
# Preview the first row of every user-level table, separated by dashed dividers.
user_previews = [
    user_table.head(1)
    for user_table in (userCuisineData, userPaymentData, userProfileData, userRatingData)
]
print(user_previews[0])
for later_preview in user_previews[1:]:
    print('------------')
    print(later_preview)

## Analyzing data
The cell below shows that some users (for example `U1004`) have tried more than one cuisine.

In [None]:
# All cuisine rows recorded for user 'U1004'.
is_target_user = userCuisineData['user_ID'] == 'U1004'
print(userCuisineData.loc[is_target_user])

## Append Datasets

In [None]:
# Stack the three score tables on top of each other, keeping each table's own
# 'Student' index labels.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0, and a single concat also avoids
# building an intermediate frame per append.
display(pd.concat([englishScoreData, geographyScoreData, mathsScoreData]))

In [None]:
# Same row-wise stacking, but discard the 'Student' index labels and number the
# rows 0..n-1 instead (ignore_index=True).
# pd.concat replaces the chained DataFrame.append calls, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
display(pd.concat([englishScoreData, geographyScoreData, mathsScoreData],
                  ignore_index=True))

## Concat Datasets

In [None]:
## Row-wise concatenation: stack the two tables, then move the 'Student'
## index back into an ordinary column.
english_then_geography = pd.concat((englishScoreData, geographyScoreData), axis='index')
display(english_then_geography.reset_index())

In [None]:
## Column-wise concatenation (axis=1): the two tables are placed side by side,
## aligned on their 'Student' index.
english_beside_geography = pd.concat((englishScoreData, geographyScoreData), axis='columns')
display(english_beside_geography)

In [None]:
# Side-by-side concatenation with a subject label as the outer column level.
# join='inner' keeps only the index labels present in all three tables.
scores_by_subject = {
    'English': englishScoreData,
    'Geography': geographyScoreData,
    'Maths': mathsScoreData,
}
display(pd.concat(scores_by_subject, axis=1, join='inner'))

In [None]:
## Multi-level row index: the outer level is the subject label, the inner
## level is each table's own 'Student' index.
subject_frames = [geographyScoreData, englishScoreData, mathsScoreData]
subject_labels = ['Geography', 'English', 'Maths']

stacked_scores = pd.concat(subject_frames, keys=subject_labels)
# Sorting the index keeps the MultiIndex ordered for the label- and
# slice-based lookups done on studentResult in the following cells.
studentResult = stacked_scores.sort_index(level=0)
display(studentResult)

In [None]:
# Keep only the rows whose outer index label is one of the chosen subjects.
selected_subjects = ['English', 'Geography']
display(studentResult.loc[selected_subjects])

In [None]:
## All marks of one student: take every outer-level label (':') and the
## inner-level label 'Joy'.
idx = pd.IndexSlice
joy_in_every_subject = idx[:, 'Joy']
display(studentResult.loc[joy_in_every_subject, :])

## Join Datasets

In [None]:
# Index-aligned join of the geography and english score tables. Column names
# that appear in both tables are disambiguated with the left/right suffixes.
display(
    geographyScoreData.join(
        englishScoreData,
        how='left',
        lsuffix='_geography',
        rsuffix='_english',
    )
)

In [None]:
# Inner join on the shared user key: the payment table's 'userID' column is
# matched against the rating table re-indexed by 'userID'.
ratings_by_user = userRatingData.set_index('userID')
payment_with_ratings = userPaymentData.join(
    ratings_by_user,
    on='userID',
    how='inner',
    lsuffix='_userPayment',
    rsuffix='_userRating',
)
# Show the first few rows that have no missing values.
display(payment_with_ratings.dropna().head())

## Merging Datasets
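NOTE: The first merge uses `on` because both datasets share the `userID` column. The second merge uses `left_on` and `right_on` because the key column is named differently in the two datasets (`user_ID` vs `userID`).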

In [None]:
# Combine each user's payment rows with their rating rows; the default merge
# keeps only users present in both tables.
userBy_payment_rating_merged = userPaymentData.merge(userRatingData, on='userID')
print(userBy_payment_rating_merged.head())

In [None]:
# Attach each user's cuisine preferences to the payment/rating table. The key
# is 'user_ID' on the left and 'userID' on the right, so both columns survive
# the merge; the redundant right-hand 'userID' is dropped after sorting by it.
# The steps are chained so no frame is mutated in place.
userBy_cuisine_payment_rating_merged = (
    pd.merge(userCuisineData, userBy_payment_rating_merged,
             left_on='user_ID', right_on='userID')
    .sort_values('userID')
    .drop('userID', axis=1)
)

# Rows where the overall, food and service ratings all equal 2.
# NOTE(review): 2 is presumably the highest score on the rating scale — confirm
# against the dataset description.
is_top_rated = (
    (userBy_cuisine_payment_rating_merged.rating == 2) &
    (userBy_cuisine_payment_rating_merged.food_rating == 2) &
    (userBy_cuisine_payment_rating_merged.service_rating == 2)
)
# .copy() makes this an independent frame rather than a slice of the merged
# table, so the later in-place sort on it does not raise SettingWithCopyWarning.
top_rated_places_users = userBy_cuisine_payment_rating_merged[is_top_rated].copy()

print(top_rated_places_users.head())

## Merging on multiple columns
Users' ratings for the cuisines available at each place ID

In [None]:
# Match top-rated user rows to restaurant cuisine rows on BOTH the place and
# the cuisine. The default merge is an inner merge, so only rows whose
# ('placeID', 'Rcuisine') pair exists in both tables are kept.
combined_user_place_data = pd.merge(top_rated_places_users, cuisineData,
         left_on=['placeID', 'Rcuisine'], right_on=['placeID', 'Rcuisine'])
print(combined_user_place_data.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
combined_user_place_data.info()

In [None]:
# Number of matched rows per distinct user and per distinct restaurant (placeID).
for counted_column in ('user_ID', 'placeID'):
    print(combined_user_place_data[counted_column].value_counts())

In [None]:
# The placeID counts printed above showed restaurant '132825' as the most
# rated one; look up its details in the places table.
is_most_rated_place = geoPlaceData['placeID'] == 132825
print(geoPlaceData.loc[is_most_rated_place])

## Other Examples
1. Performing right merge
2. Using on clause instead of right_on and left_on
3. Using suffixes attribute
4. Using the `merge_ordered` and `merge_asof` functions to merge datasets

In [None]:
# Besides the default inner merge, pd.merge also supports outer, left and right
# merges. This cell performs a RIGHT merge: every row of cuisineData is kept,
# and the user-side columns are left empty where no top-rated row matches.
# NOTE: the variable name says "left" for historical reasons; the merge itself
# is how='right'.
combined_user_place_left_data = pd.merge(top_rated_places_users, cuisineData,
         left_on=['placeID', 'Rcuisine'], right_on=['placeID', 'Rcuisine'], how='right')
print(combined_user_place_left_data.head())
# info() prints its own summary and returns None; printing its return value
# would add a stray "None" line.
combined_user_place_left_data.info()

In [None]:
# When the key columns have the same names in both tables, a single `on`
# argument replaces the left_on / right_on pair. This is the same right merge
# as before, written with `on`.
combined_user_place_on_data = pd.merge(top_rated_places_users, cuisineData,
                                       on=['placeID', 'Rcuisine'], how='right')
print(combined_user_place_on_data.head())
# info() prints its own summary and returns None; printing its return value
# would add a stray "None" line.
combined_user_place_on_data.info()

In [None]:
# Ordered merge on 'placeID' only. 'Rcuisine' exists in both tables but is not
# part of the key, so the two copies are told apart by the '_places' and
# '_cuisine' suffixes. fill_method='ffill' forward-fills the gaps left in the
# merged result.
combined_user_place_suffixes_data = pd.merge_ordered(top_rated_places_users, cuisineData,
         on=['placeID'],suffixes=['_places','_cuisine'], fill_method='ffill')
print(combined_user_place_suffixes_data.tail())
# info() prints its own summary and returns None; printing its return value
# would add a stray "None" line.
combined_user_place_suffixes_data.info()

In [None]:
# merge_asof needs both inputs sorted by the key. The left table is rebound to
# a sorted copy instead of being sorted in place: it was produced by a boolean
# filter, and an in-place sort on such a slice raises SettingWithCopyWarning.
# cuisineData is expected to still be sorted by 'placeID' from the sorting cell
# earlier in the notebook.
top_rated_places_users = top_rated_places_users.sort_values('placeID')

# merge_asof joins on one key, so `on` is passed as a single label. The
# overlapping 'Rcuisine' column gets the '_places' / '_cuisine' suffixes.
combined_user_place_suffixes_data = pd.merge_asof(top_rated_places_users, cuisineData,
         on='placeID', suffixes=['_places','_cuisine'])
display(combined_user_place_suffixes_data.head())
# info() prints its own summary and returns None; printing its return value
# would add a stray "None" line.
combined_user_place_suffixes_data.info()