## Importing packages

In [1]:
import pandas as pd

## Importing data and creating dataFrames

In [2]:
# Restaurant-side tables, read in one pass. Each table carries a `placeID` column.
paymentModeData, cuisineData, hoursData, parkingData, geoPlaceData = map(
    pd.read_csv,
    [
        'chefmozaccepts.csv',  # payment modes accepted by a place (Rpayment)
        'chefmozcuisine.csv',  # cuisines offered by a place (Rcuisine)
        'chefmozhours4.csv',   # opening hours and days
        'chefmozparking.csv',  # parking facilities (parking_lot)
        'geoplaces2.csv',      # location, name, address and other place attributes
    ],
)

In [3]:
# User-side tables.
userPaymentData = pd.read_csv('userpayment.csv')
userProfileData = pd.read_csv('userprofile.csv')
userRatingData = pd.read_csv('rating_final.csv')

# The cuisine table gets its columns renamed while loading: `header=0` treats
# the first row as the header (as the default read does) and `names` replaces
# those labels.
userCuisineData = pd.read_csv(
    'usercuisine.csv',
    header=0,
    names=['user_ID', 'Rcuisine'],
)

## Sorting datasets

In [4]:
# Sort every restaurant-side table by its key column.
# Reassignment is used for all five tables (instead of mixing it with
# `inplace=True`) so the cell is uniform and no frame is mutated behind the
# back of an earlier reference.
paymentModeData = paymentModeData.sort_values('placeID')
cuisineData = cuisineData.sort_values('placeID')
hoursData = hoursData.sort_values('placeID')
parkingData = parkingData.sort_values('placeID')
geoPlaceData = geoPlaceData.sort_values('placeID')

In [5]:
# Sort every user-side table by its user key.
# `sort_values` returns a new frame; rebinding the name avoids in-place
# mutation. The cuisine table uses `user_ID` because its columns were renamed
# when it was loaded.
userCuisineData = userCuisineData.sort_values('user_ID')
userPaymentData = userPaymentData.sort_values('userID')
userProfileData = userProfileData.sort_values('userID')
userRatingData = userRatingData.sort_values('userID')

In [6]:
# Show the first row of each restaurant-side table, with a dashed rule
# between consecutive tables.
print(
    *(
        frame.head(1)
        for frame in (paymentModeData, cuisineData, hoursData, parkingData, geoPlaceData)
    ),
    sep='\n------------\n',
)

      placeID     Rpayment
1313   132002  Diners_Club
------------
     placeID       Rcuisine
915   132001  Dutch-Belgian
------------
      placeID         hours  days
2338   132012  12:00-22:00;  Sat;
------------
     placeID parking_lot
701   132012      street
------------
     placeID   latitude  longitude  \
106   132560  23.752304 -99.166913   

                                        the_geom_meter                name  \
106  0101000020957F0000FC60BDA8E88157C1B2C357D6DA4E...  puesto de gorditas   

                   address      city       state country fax      ...        \
106  frente al tecnologico  victoria  tamaulipas  mexico   ?      ...         

               alcohol smoking_area dress_code     accessibility price url  \
106  No_Alcohol_Served    permitted   informal  no_accessibility   low   ?   

    Rambience franchise  area other_services  
106  familiar         f  open           none  

[1 rows x 21 columns]


In [7]:
# Show the first row of each user-side table, with a dashed rule between
# consecutive tables.
print(
    *(
        frame.head(1)
        for frame in (userCuisineData, userPaymentData, userProfileData, userRatingData)
    ),
    sep='\n------------\n',
)

  user_ID  Rcuisine
0   U1001  American
------------
  userID Upayment
0  U1001     cash
------------
  userID   latitude   longitude smoker drink_level dress_preference ambience  \
0  U1001  22.139997 -100.978803  false  abstemious         informal   family   

  transport marital_status        hijos  birth_year interest  \
0   on foot         single  independent        1989  variety   

         personality religion activity  color  weight  budget  height  
0  thrifty-protector     none  student  black      69  medium    1.77  
------------
    userID  placeID  rating  food_rating  service_rating
691  U1001   135045       1            1               1


## Analyzing data
The output of the code below shows that some users have tried multiple cuisines.

In [8]:
# All cuisine rows recorded for a single user.
print(userCuisineData.loc[userCuisineData['user_ID'] == 'U1004'])

   user_ID              Rcuisine
11   U1004             Cafeteria
10   U1004  Continental-European
8    U1004                Bagels
7    U1004               Mexican
9    U1004      Cafe-Coffee_Shop
5    U1004              Japanese
4    U1004      Breakfast-Brunch
3    U1004                Bakery
6    U1004          Contemporary


## Append Datasets

## Concat Datasets

## Join Datasets

## Merging Datasets
NOTE: In the second merge the key column is named differently in the two datasets (`user_ID` vs. `userID`), so we specify `left_on` and `right_on` instead of a single `on`.

In [9]:
# Both tables name their key `userID`, so a single `on` is enough.
# The default (inner) merge keeps only users present in both tables.
userBy_payment_rating_merged = userPaymentData.merge(userRatingData, on='userID')
print(userBy_payment_rating_merged.head())

  userID Upayment  placeID  rating  food_rating  service_rating
0  U1001     cash   135045       1            1               1
1  U1001     cash   135039       1            1               1
2  U1001     cash   135025       2            2               2
3  U1001     cash   135051       1            1               2
4  U1001     cash   132825       2            2               1


In [10]:
# The user key is `user_ID` in the cuisine table and `userID` in the merged
# payment/rating table, so the key is named per side. The merge keeps both key
# columns; the result is sorted by user and the redundant `userID` is dropped.
userBy_cuisine_payment_rating_merged = (
    pd.merge(userCuisineData, userBy_payment_rating_merged,
             left_on='user_ID', right_on='userID')
    .sort_values('userID')
    .drop('userID', axis=1)
)

# Keep only rows where the overall, food and service ratings all equal 2
# (presumably the highest score on this scale -- confirm against the data).
# `.copy()` makes the result an independent frame, so later cells can sort or
# modify it without pandas treating it as a slice of the merged table.
top_rated_places_users = userBy_cuisine_payment_rating_merged[
    userBy_cuisine_payment_rating_merged[
        ['rating', 'food_rating', 'service_rating']
    ].eq(2).all(axis=1)
].copy()

print(top_rated_places_users.head())

   user_ID  Rcuisine Upayment  placeID  rating  food_rating  service_rating
2    U1001  American     cash   135025       2            2               2
13   U1002   Mexican     cash   132862       2            2               2
31   U1003   Mexican     cash   132754       2            2               2
30   U1003   Mexican     cash   135059       2            2               2
29   U1003   Mexican     cash   132922       2            2               2


## Merging on multiple columns
Users' ratings for the cuisines available at each place, matched on both `placeID` and `Rcuisine`.

In [11]:
# The default merge is an inner merge: a row survives only when the place
# visited by the user (`placeID`) also lists the user's cuisine (`Rcuisine`)
# in `cuisineData`. Both columns together form the merge key.
combined_user_place_data = pd.merge(top_rated_places_users, cuisineData,
         left_on=['placeID', 'Rcuisine'], right_on=['placeID', 'Rcuisine'])
print(combined_user_place_data.head())
# `info()` prints its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit a stray "None").
combined_user_place_data.info()

  user_ID Rcuisine          Upayment  placeID  rating  food_rating  \
0   U1003  Mexican              cash   132754       2            2   
1   U1036  Mexican              cash   132754       2            2   
2   U1059  Mexican              cash   132754       2            2   
3   U1061  Mexican  bank_debit_cards   132754       2            2   
4   U1061  Mexican              cash   132754       2            2   

   service_rating  
0               2  
1               2  
2               2  
3               2  
4               2  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 81 entries, 0 to 80
Data columns (total 7 columns):
user_ID           81 non-null object
Rcuisine          81 non-null object
Upayment          81 non-null object
placeID           81 non-null int64
rating            81 non-null int64
food_rating       81 non-null int64
service_rating    81 non-null int64
dtypes: int64(4), object(3)
memory usage: 5.1+ KB
None


In [12]:
# Number of matched top-rated rows per distinct user, then per distinct restaurant
print(combined_user_place_data['user_ID'].value_counts())
print(combined_user_place_data['placeID'].value_counts())

U1004    10
U1116     9
U1108     6
U1077     3
U1036     3
U1071     3
U1137     3
U1078     3
U1109     2
U1093     2
U1103     2
U1048     2
U1096     2
U1132     2
U1061     2
U1107     2
U1136     2
U1085     2
U1099     2
U1028     2
U1056     2
U1026     2
U1003     1
U1060     1
U1080     1
U1067     1
U1083     1
U1123     1
U1030     1
U1134     1
U1059     1
U1126     1
U1054     1
U1095     1
U1016     1
Name: user_ID, dtype: int64
132825    12
135025     7
135106     6
132834     6
135028     6
132723     5
132754     5
135032     5
135055     4
132613     3
132584     3
132755     3
135075     3
132954     2
134999     2
132665     2
132630     1
135035     1
135104     1
135027     1
135018     1
132717     1
132608     1
Name: placeID, dtype: int64


In [13]:
# The restaurant with the most matched top-rated rows is taken from the counts
# instead of being hard-coded, so the cell stays correct if the data changes.
# `value_counts()` counts rows per placeID and `idxmax()` returns the placeID
# with the largest count (the first one if several tie).
most_rated_place_id = combined_user_place_data.placeID.value_counts().idxmax()

# Look up the details of that restaurant:
print(geoPlaceData[geoPlaceData.placeID == most_rated_place_id])

   placeID   latitude   longitude  \
1   132825  22.147392 -100.983092   

                                      the_geom_meter             name  \
1  0101000020957F00001AD016568C4858C1243261274BA5...  puesto de tacos   

                                  address    city   state country fax  \
1  esquina santos degollado y leon guzman  s.l.p.  s.l.p.  mexico   ?   

       ...                  alcohol smoking_area dress_code accessibility  \
1      ...        No_Alcohol_Served         none   informal    completely   

  price url Rambience franchise  area other_services  
1   low   ?  familiar         f  open           none  

[1 rows x 21 columns]


## Other Examples
1. Performing right merge
2. Using on clause instead of right_on and left_on
3. Using suffixes attribute
4. Using merge_ordered and merge_asof function to merge datasets

In [14]:
# Outer, left and right merges are available through `how`.
# This is a RIGHT merge: every row of `cuisineData` is kept, and the user
# columns are NaN where no top-rated user row matches (which is why the
# rating columns become float64).
# NOTE: the variable name says "left" but the merge is `how='right'`; the name
# is kept unchanged so existing references keep working.
combined_user_place_left_data = pd.merge(top_rated_places_users, cuisineData,
         left_on=['placeID', 'Rcuisine'], right_on=['placeID', 'Rcuisine'], how='right')
print(combined_user_place_left_data.head())
# `info()` prints its report itself and returns None; calling it directly
# avoids the stray "None" that print(df.info()) produces.
combined_user_place_left_data.info()

  user_ID Rcuisine          Upayment  placeID  rating  food_rating  \
0   U1003  Mexican              cash   132754     2.0          2.0   
1   U1036  Mexican              cash   132754     2.0          2.0   
2   U1059  Mexican              cash   132754     2.0          2.0   
3   U1061  Mexican  bank_debit_cards   132754     2.0          2.0   
4   U1061  Mexican              cash   132754     2.0          2.0   

   service_rating  
0             2.0  
1             2.0  
2             2.0  
3             2.0  
4             2.0  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 972
Data columns (total 7 columns):
user_ID           81 non-null object
Rcuisine          973 non-null object
Upayment          81 non-null object
placeID           973 non-null int64
rating            81 non-null float64
food_rating       81 non-null float64
service_rating    81 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 60.8+ KB
None


In [15]:
# When the key columns have the same names in both frames, `left_on` and
# `right_on` are unnecessary: a single `on` names the keys for both sides.
combined_user_place_on_data = pd.merge(top_rated_places_users, cuisineData,
                                       on=['placeID', 'Rcuisine'], how='right')
print(combined_user_place_on_data.head())
# `info()` prints its report itself and returns None; calling it directly
# avoids the stray "None" that print(df.info()) produces.
combined_user_place_on_data.info()

  user_ID Rcuisine          Upayment  placeID  rating  food_rating  \
0   U1003  Mexican              cash   132754     2.0          2.0   
1   U1036  Mexican              cash   132754     2.0          2.0   
2   U1059  Mexican              cash   132754     2.0          2.0   
3   U1061  Mexican  bank_debit_cards   132754     2.0          2.0   
4   U1061  Mexican              cash   132754     2.0          2.0   

   service_rating  
0             2.0  
1             2.0  
2             2.0  
3             2.0  
4             2.0  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 972
Data columns (total 7 columns):
user_ID           81 non-null object
Rcuisine          973 non-null object
Upayment          81 non-null object
placeID           973 non-null int64
rating            81 non-null float64
food_rating       81 non-null float64
service_rating    81 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 60.8+ KB
None


In [16]:
# `merge_ordered` merges on `placeID` and returns the rows ordered by that key.
# Both frames have an `Rcuisine` column that is not part of the key, so
# `suffixes` disambiguates them as `Rcuisine_places` / `Rcuisine_cuisine`.
# `fill_method='ffill'` forward-fills values that are missing after the merge.
# NOTE(review): forward-filled user/rating values on placeIDs that have no
# user row are copied from the preceding row, not real ratings -- treat this
# as a demonstration of the API only.
combined_user_place_suffixes_data = pd.merge_ordered(
    top_rated_places_users, cuisineData,
    on=['placeID'], suffixes=('_places', '_cuisine'), fill_method='ffill')
print(combined_user_place_suffixes_data.tail())
# `info()` prints its report itself and returns None; calling it directly
# avoids the stray "None" that print(df.info()) produces.
combined_user_place_suffixes_data.info()

     user_ID Rcuisine_places Upayment  placeID  rating  food_rating  \
1671   U1126         Mexican     cash   135106     2.0          2.0   
1672   U1126         Mexican     cash   135107     2.0          2.0   
1673   U1037         Mexican     cash   135108     2.0          2.0   
1674   U1037         Mexican     cash   135109     2.0          2.0   
1675   U1037         Mexican     cash   135110     2.0          2.0   

      service_rating Rcuisine_cuisine  
1671             2.0          Mexican  
1672             2.0   Latin_American  
1673             2.0   Latin_American  
1674             2.0          Italian  
1675             2.0          Spanish  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1676 entries, 0 to 1675
Data columns (total 8 columns):
user_ID             1272 non-null object
Rcuisine_places     1272 non-null object
Upayment            1272 non-null object
placeID             1676 non-null int64
rating              1272 non-null float64
food_rating         12

In [17]:
# `merge_asof` requires both inputs to be sorted by the key.
# Rebinding to a sorted frame replaces the old `is_copy = False` hack (that
# attribute was removed from pandas) and avoids sorting a filtered slice in
# place. `cuisineData` was already sorted by `placeID` in the sorting section.
top_rated_places_users = top_rated_places_users.sort_values('placeID')

# With the default backward direction, each user row is matched to the last
# `cuisineData` row whose `placeID` is less than or equal to its own.
# `merge_asof` takes a single key, so `on` is passed as a plain label.
combined_user_place_suffixes_data = pd.merge_asof(
    top_rated_places_users, cuisineData,
    on='placeID', suffixes=('_places', '_cuisine'))
display(combined_user_place_suffixes_data.head())
# `info()` prints its report itself and returns None; calling it directly
# avoids the stray "None" that print(df.info()) produces.
combined_user_place_suffixes_data.info()

Unnamed: 0,user_ID,Rcuisine_places,Upayment,placeID,rating,food_rating,service_rating,Rcuisine_cuisine
0,U1060,Tex-Mex,cash,132564,2,2,2,Regional
1,U1060,American,cash,132564,2,2,2,Regional
2,U1060,Spanish,cash,132564,2,2,2,Regional
3,U1060,Cafe-Coffee_Shop,cash,132564,2,2,2,Regional
4,U1060,Burgers,cash,132564,2,2,2,Regional


<class 'pandas.core.frame.DataFrame'>
Int64Index: 753 entries, 0 to 752
Data columns (total 8 columns):
user_ID             753 non-null object
Rcuisine_places     753 non-null object
Upayment            753 non-null object
placeID             753 non-null int64
rating              753 non-null int64
food_rating         753 non-null int64
service_rating      753 non-null int64
Rcuisine_cuisine    753 non-null object
dtypes: int64(4), object(4)
memory usage: 52.9+ KB
None
