In [2]:
import pandas as pd
import numpy as np 

# Splitting

In [3]:
df = pd.read_csv("census.csv")
df = df[df["SUMLEV"] == 50]
print(df.head())

   SUMLEV  REGION  DIVISION  STATE  COUNTY   STNAME         CTYNAME  \
1      50       3         6      1       1  Alabama  Autauga County   
2      50       3         6      1       3  Alabama  Baldwin County   
3      50       3         6      1       5  Alabama  Barbour County   
4      50       3         6      1       7  Alabama     Bibb County   
5      50       3         6      1       9  Alabama   Blount County   

   CENSUS2010POP  ESTIMATESBASE2010  POPESTIMATE2010  ...  RDOMESTICMIG2011  \
1          54571              54571            54660  ...          7.242091   
2         182265             182265           183193  ...         14.832960   
3          27457              27457            27341  ...         -4.728132   
4          22915              22919            22861  ...         -5.527043   
5          57322              57322            57373  ...          1.807375   

   RDOMESTICMIG2012  RDOMESTICMIG2013  RDOMESTICMIG2014  RDOMESTICMIG2015  \
1         -2.915927  

In [4]:
%%timeit -n 3

for state in df["STNAME"].unique():
    average = np.average(df.where(df["STNAME"] == state).dropna()["CENSUS2010POP"])
    print(f"Counties in state {state} have an average population of {average})")


Counties in state Alabama have an average population of 71339.34328358209)
Counties in state Alaska have an average population of 24490.724137931036)
Counties in state Arizona have an average population of 426134.4666666667)
Counties in state Arkansas have an average population of 38878.90666666667)
Counties in state California have an average population of 642309.5862068966)
Counties in state Colorado have an average population of 78581.1875)
Counties in state Connecticut have an average population of 446762.125)
Counties in state Delaware have an average population of 299311.3333333333)
Counties in state District of Columbia have an average population of 601723.0)
Counties in state Florida have an average population of 280616.5671641791)
Counties in state Georgia have an average population of 60928.63522012578)
Counties in state Hawaii have an average population of 272060.2)
Counties in state Idaho have an average population of 35626.86363636364)
Counties in state Illinois have an av

In [5]:
%%timeit -n 3

for group, frame in df.groupby("STNAME"):
    average = np.average(frame["CENSUS2010POP"])
    print(f"Counties in state {group} have an average population of {average}")
# ~160 times faster than the other approach

Counties in state Alabama have an average population of 71339.34328358209
Counties in state Alaska have an average population of 24490.724137931036
Counties in state Arizona have an average population of 426134.4666666667
Counties in state Arkansas have an average population of 38878.90666666667
Counties in state California have an average population of 642309.5862068966
Counties in state Colorado have an average population of 78581.1875
Counties in state Connecticut have an average population of 446762.125
Counties in state Delaware have an average population of 299311.3333333333
Counties in state District of Columbia have an average population of 601723.0
Counties in state Florida have an average population of 280616.5671641791
Counties in state Georgia have an average population of 60928.63522012578
Counties in state Hawaii have an average population of 272060.2
Counties in state Idaho have an average population of 35626.86363636364
Counties in state Illinois have an average populat

In [6]:
# kind of a hash function in order to use groupby with 3 "groups", one at a time
def set_batch_number(item):
    if item[0] < 'M':
        return 0
    if item[0] < 'Q':
        return 1
    return 2

df = df.set_index("STNAME")

# if no column is given to .groupby() it automatically uses the index
for group, frame in df.groupby(set_batch_number):
    print(f"There are {len(frame)} records in group {group} for processing")

There are 1177 records in group 0 for processing
There are 1134 records in group 1 for processing
There are 831 records in group 2 for processing


In [22]:
df = pd.read_csv("listings.csv")
print(df.head())

         id                            listing_url       scrape_id  \
0  12147973  https://www.airbnb.com/rooms/12147973  20160906204935   
1   3075044   https://www.airbnb.com/rooms/3075044  20160906204935   
2      6976      https://www.airbnb.com/rooms/6976  20160906204935   
3   1436513   https://www.airbnb.com/rooms/1436513  20160906204935   
4   7651065   https://www.airbnb.com/rooms/7651065  20160906204935   

  last_scraped                                           name  \
0   2016-09-07                     Sunny Bungalow in the City   
1   2016-09-07              Charming room in pet friendly apt   
2   2016-09-07               Mexican Folk Art Haven in Boston   
3   2016-09-07  Spacious Sunny Bedroom Suite in Historic Home   
4   2016-09-07                            Come Home to Boston   

                                             summary  \
0  Cozy, sunny, family home.  Master bedroom high...   
1  Charming and quiet room in a second floor 1910...   
2  Come stay with a 

In [23]:
df = df.set_index(["cancellation_policy", "review_scores_value"])

# when there is a multi index, we have to determine the level of importance of each of them 
for group, frame in df.groupby(level=(0, 1)):
    print(group)

('flexible', 2.0)
('flexible', 4.0)
('flexible', 5.0)
('flexible', 6.0)
('flexible', 7.0)
('flexible', 8.0)
('flexible', 9.0)
('flexible', 10.0)
('moderate', 2.0)
('moderate', 4.0)
('moderate', 6.0)
('moderate', 7.0)
('moderate', 8.0)
('moderate', 9.0)
('moderate', 10.0)
('strict', 2.0)
('strict', 3.0)
('strict', 4.0)
('strict', 5.0)
('strict', 6.0)
('strict', 7.0)
('strict', 8.0)
('strict', 9.0)
('strict', 10.0)
('super_strict_30', 6.0)
('super_strict_30', 7.0)
('super_strict_30', 8.0)
('super_strict_30', 9.0)
('super_strict_30', 10.0)


In [24]:
def grouping_function(item): # separating the 10s
    if item[1] == 10.0:
        return (item[0], "10.0")
    return (item[0], "not 10.0")

for group, frame in df.groupby(by=grouping_function):
    print(group)

('flexible', '10.0')
('flexible', 'not 10.0')
('moderate', '10.0')
('moderate', 'not 10.0')
('strict', '10.0')
('strict', 'not 10.0')
('super_strict_30', '10.0')
('super_strict_30', 'not 10.0')


In [25]:
print(df.head())

                                               id  \
cancellation_policy review_scores_value             
moderate            NaN                  12147973   
                    9.0                   3075044   
                    10.0                     6976   
                    10.0                  1436513   
flexible            10.0                  7651065   

                                                                   listing_url  \
cancellation_policy review_scores_value                                          
moderate            NaN                  https://www.airbnb.com/rooms/12147973   
                    9.0                   https://www.airbnb.com/rooms/3075044   
                    10.0                     https://www.airbnb.com/rooms/6976   
                    10.0                  https://www.airbnb.com/rooms/1436513   
flexible            10.0                  https://www.airbnb.com/rooms/7651065   

                                              scrape_

# Aggregation

In [26]:
df.reset_index(inplace=True)
print(df.groupby("cancellation_policy").agg({"review_scores_value": np.average}))
# what comes inside the .agg() functions is the columns we want to aggregate
# does not work since np.average does not ignore nan values

                     review_scores_value
cancellation_policy                     
flexible                             NaN
moderate                             NaN
strict                               NaN
super_strict_30                      NaN


In [27]:
print(df.groupby("cancellation_policy").agg({"review_scores_value": np.nanmean}))
# np.nanmean is the same thing as the np.average but it ignores nan values

                     review_scores_value
cancellation_policy                     
flexible                        9.237421
moderate                        9.307398
strict                          9.081441
super_strict_30                 8.537313


  print(df.groupby("cancellation_policy").agg({"review_scores_value": np.nanmean}))


In [28]:
print(df.groupby("cancellation_policy").agg({"review_scores_value": (np.nanmean, np.nanstd), "reviews_per_month": np.nanmean}))
# nanmean and nanstd will be like a sub-column
# it groups by cancellation policy and then incorporates the columns specified

                    review_scores_value           reviews_per_month
                                nanmean    nanstd           nanmean
cancellation_policy                                                
flexible                       9.237421  1.096271          1.829210
moderate                       9.307398  0.859859          2.391922
strict                         9.081441  1.040531          1.873467
super_strict_30                8.537313  0.840785          0.340143


  print(df.groupby("cancellation_policy").agg({"review_scores_value": (np.nanmean, np.nanstd), "reviews_per_month": np.nanmean}))
  print(df.groupby("cancellation_policy").agg({"review_scores_value": (np.nanmean, np.nanstd), "reviews_per_month": np.nanmean}))
  print(df.groupby("cancellation_policy").agg({"review_scores_value": (np.nanmean, np.nanstd), "reviews_per_month": np.nanmean}))


# Transformation

In [29]:
columns = ["cancellation_policy", "review_scores_value"]
transform_df = df[columns].groupby("cancellation_policy").transform(np.nanmean)
print(transform_df.head())

   review_scores_value
0             9.307398
1             9.307398
2             9.307398
3             9.307398
4             9.237421


  transform_df = df[columns].groupby("cancellation_policy").transform(np.nanmean)


In [30]:
transform_df.rename({"review_scores_value": "mean_review_scores"}, axis="columns", inplace=True)
df = df.merge(transform_df, left_index=True, right_index=True)
print(df.head())

  cancellation_policy  review_scores_value        id  \
0            moderate                  NaN  12147973   
1            moderate                  9.0   3075044   
2            moderate                 10.0      6976   
3            moderate                 10.0   1436513   
4            flexible                 10.0   7651065   

                             listing_url       scrape_id last_scraped  \
0  https://www.airbnb.com/rooms/12147973  20160906204935   2016-09-07   
1   https://www.airbnb.com/rooms/3075044  20160906204935   2016-09-07   
2      https://www.airbnb.com/rooms/6976  20160906204935   2016-09-07   
3   https://www.airbnb.com/rooms/1436513  20160906204935   2016-09-07   
4   https://www.airbnb.com/rooms/7651065  20160906204935   2016-09-07   

                                            name  \
0                     Sunny Bungalow in the City   
1              Charming room in pet friendly apt   
2               Mexican Folk Art Haven in Boston   
3  Spacious Sunn

In [31]:
df["mean_diff"] = np.absolute(df["review_scores_value"] - df["mean_review_scores"])
print(df["mean_diff"].head())

0         NaN
1    0.307398
2    0.692602
3    0.692602
4    0.762579
Name: mean_diff, dtype: float64


# Filtering

In [32]:
print(df.groupby("cancellation_policy").filter(lambda x: np.nanmean(x["review_scores_value"]) > 9.2))
# here we select the rows which have a review_scores value > 9.2

     cancellation_policy  review_scores_value        id  \
0               moderate                  NaN  12147973   
1               moderate                  9.0   3075044   
2               moderate                 10.0      6976   
3               moderate                 10.0   1436513   
4               flexible                 10.0   7651065   
...                  ...                  ...       ...   
3576            flexible                  NaN  14689681   
3577            flexible                  NaN  13750763   
3579            flexible                  NaN  14852179   
3582            flexible                  NaN  14585486   
3584            flexible                  NaN  14504422   

                                listing_url       scrape_id last_scraped  \
0     https://www.airbnb.com/rooms/12147973  20160906204935   2016-09-07   
1      https://www.airbnb.com/rooms/3075044  20160906204935   2016-09-07   
2         https://www.airbnb.com/rooms/6976  20160906204935   2

# Applying

In [33]:
df = pd.read_csv("listings.csv")
df = df[["cancellation_policy", "review_scores_value"]]
print(df.head())

  cancellation_policy  review_scores_value
0            moderate                  NaN
1            moderate                  9.0
2            moderate                 10.0
3            moderate                 10.0
4            flexible                 10.0


In [37]:
def calc_mean_review_scores(group):
    average = np.nanmean(group["review_scores_value"])
    group["review_scores_mean"] = np.abs(average - group["review_scores_value"])
    return group

print(df.groupby("cancellation_policy").apply(calc_mean_review_scores).head())
# it is slow but if the df is not large, it is very useful

                       cancellation_policy  review_scores_value  \
cancellation_policy                                               
flexible            4             flexible                 10.0   
                    5             flexible                 10.0   
                    10            flexible                 10.0   
                    11            flexible                  9.0   
                    12            flexible                 10.0   

                        review_scores_mean  
cancellation_policy                         
flexible            4             0.762579  
                    5             0.762579  
                    10            0.762579  
                    11            0.237421  
                    12            0.762579  
