# Descriptive Statistics Assignment

In [None]:
import pandas as pd
from scipy import stats

In [None]:
# Inside Airbnb summary listings CSV for New York City (the URL path is dated 2020-06-08).
data_url = (
    "http://data.insideairbnb.com/united-states/ny/new-york-city/"
    "2020-06-08/visualisations/listings.csv"
)

### Read the aggregated AirBnB listings of New York (using `data_url`) into a Pandas data frame.

In [None]:
# Fetch the listings CSV from data_url and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_url)

### Print the data type for each column.

In [None]:
# Show the dtype pandas inferred for every column of the listings frame.
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

### Generate a basic set of descriptive statistics for the numeric fields in the data set.

In [None]:
# Peek at the first five rows before picking out the numeric columns.
df.iloc[:5]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2060,Modern NYC,2259,Jenny,Manhattan,Washington Heights,40.85722,-73.9379,Private room,100,1,1,2008-09-22,0.01,1,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,3,48,2019-11-04,0.37,2,335
2,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,322,2020-06-07,4.64,1,276
3,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,78,2019-10-13,0.58,1,0
4,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,29,50,2019-12-02,0.37,1,365


In [None]:
# Subset of df holding the numeric columns used for the descriptive statistics.
# NOTE(review): latitude and longitude are float64 as well but are not selected here — confirm that is intentional.
df_nums = df.loc[
    :,
    [
        'id',
        'host_id',
        'price',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
    ],
]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for df_nums,
# transposed so that each row describes one column of df_nums.
df_stats = df_nums.describe().transpose()
df_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,49530.0,22959640.0,13526830.0,2060.0,10850500.75,22336021.0,35577790.0,43703359.0
host_id,49530.0,85099510.0,98875970.0,2259.0,9269052.25,38004834.0,137358900.0,349082641.0
price,49530.0,162.6439,419.3123,0.0,68.0,101.0,175.0,10000.0
minimum_nights,49530.0,8.19154,21.97483,1.0,2.0,3.0,6.0,1250.0
number_of_reviews,49530.0,23.86751,48.24582,0.0,1.0,5.0,23.0,746.0
reviews_per_month,38211.0,1.008095,1.345213,0.01,0.15,0.45,1.42,53.8
calculated_host_listings_count,49530.0,6.23303,25.48529,1.0,1.0,1.0,2.0,280.0
availability_365,49530.0,126.6668,142.3814,0.0,0.0,79.0,267.0,365.0


### Calculate the following for each numeric field in the data set and add them to the data frame containing the basic set of descriptive statistics.

- Variance
- Mean Absolute Deviation
- Range
- Interquartile Range

In [None]:
# Sample variance of each numeric column (pandas default, ddof=1).
df_stats['Variance'] = df_nums.var()

# Mean absolute deviation around the mean of each column.
# DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so it is
# computed explicitly here; mean() skips NaNs, matching the old .mad() default.
df_stats['Mean Abs Deviation'] = (df_nums - df_nums.mean()).abs().mean()

# Range of each column: the maximum value minus the minimum value.
df_stats['Range'] = df_nums.max() - df_nums.min()

# Interquartile range: the 75th percentile minus the 25th percentile,
# reusing the quartiles already produced by describe().
df_stats['IQR'] = df_stats['75%'] - df_stats['25%']

In [None]:
# Display the statistics table, now extended with Variance, Mean Abs Deviation, Range and IQR.
df_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Variance,Mean Abs Deviation,Range,IQR
id,49530.0,22959640.0,13526830.0,2060.0,10850500.75,22336021.0,35577790.0,43703359.0,182975200000000.0,11816040.0,43701300.0,24727290.0
host_id,49530.0,85099510.0,98875970.0,2259.0,9269052.25,38004834.0,137358900.0,349082641.0,9776457000000000.0,81792770.0,349080400.0,128089800.0
price,49530.0,162.6439,419.3123,0.0,68.0,101.0,175.0,10000.0,175822.8,109.0032,10000.0,107.0
minimum_nights,49530.0,8.19154,21.97483,1.0,2.0,3.0,6.0,1250.0,482.8933,8.992644,1249.0,4.0
number_of_reviews,49530.0,23.86751,48.24582,0.0,1.0,5.0,23.0,746.0,2327.659,29.06928,746.0,22.0
reviews_per_month,38211.0,1.008095,1.345213,0.01,0.15,0.45,1.42,53.8,1.809598,0.9489338,53.79,1.27
calculated_host_listings_count,49530.0,6.23303,25.48529,1.0,1.0,1.0,2.0,280.0,649.5002,8.506215,279.0,1.0
availability_365,49530.0,126.6668,142.3814,0.0,0.0,79.0,267.0,365.0,20272.47,126.5533,365.0,267.0


### Generate a basic set of descriptive statistics for the categorical fields in the data set.

In [None]:
# Print column names, non-null counts and dtypes to identify the categorical (object) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49530 entries, 0 to 49529
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              49530 non-null  int64  
 1   name                            49512 non-null  object 
 2   host_id                         49530 non-null  int64  
 3   host_name                       49524 non-null  object 
 4   neighbourhood_group             49530 non-null  object 
 5   neighbourhood                   49530 non-null  object 
 6   latitude                        49530 non-null  float64
 7   longitude                       49530 non-null  float64
 8   room_type                       49530 non-null  object 
 9   price                           49530 non-null  int64  
 10  minimum_nights                  49530 non-null  int64  
 11  number_of_reviews               49530 non-null  int64  
 12  last_review                     

In [None]:
# Subset of df holding the categorical (object-dtype) columns.
df_cats = df.loc[
    :,
    [
        'name',
        'host_name',
        'neighbourhood_group',
        'neighbourhood',
        'room_type',
        'last_review',
    ],
]

In [None]:
# Summary statistics for the categorical columns, transposed so that each row
# describes one column of df_cats.
cat_stats = df_cats.describe().T
# End the cell with the frame so the statistics are actually displayed.
cat_stats

### Calculate the Pearson correlation coefficients between price, minimum_nights, reviews_per_month, calculated_host_listings_count and availability_365

In [None]:
# Pairwise Pearson correlation coefficients between the selected numeric columns.
df[
    [
        'price',
        'minimum_nights',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
    ]
].corr(method='pearson')

Unnamed: 0,price,minimum_nights,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.015923,-0.011623,0.00473,0.055788
minimum_nights,0.015923,1.0,-0.098409,0.165421,0.148435
reviews_per_month,-0.011623,-0.098409,1.0,-0.066725,0.213093
calculated_host_listings_count,0.00473,0.165421,-0.066725,1.0,0.173181
availability_365,0.055788,0.148435,0.213093,0.173181,1.0


### Group the data by neighbourhood and calculate the mean and standard deviation of price for each neighbourhood.

In [None]:
# GroupBy object keyed on the neighbourhood column; reused by the cells below.
df_neighborhoods = df.groupby(by='neighbourhood')

In [None]:
# Mean listing price within each neighbourhood.
df_neighborhoods['price'].agg('mean')

neighbourhood
Allerton             94.500000
Arden Heights       102.600000
Arrochar            104.315789
Arverne             187.125000
Astoria             104.109865
                      ...     
Windsor Terrace     138.067114
Woodhaven            69.638095
Woodlawn             61.700000
Woodrow            2350.000000
Woodside             69.383721
Name: price, Length: 222, dtype: float64

In [None]:
# Standard deviation of listing price within each neighbourhood.
df_neighborhoods['price'].agg('std')

neighbourhood
Allerton             83.219379
Arden Heights        54.834296
Arrochar            136.241229
Arverne             231.710658
Astoria             124.606979
                      ...     
Windsor Terrace      89.830596
Woodhaven            43.294191
Woodlawn             24.882167
Woodrow            2333.452378
Woodside             85.328235
Name: price, Length: 222, dtype: float64

### Create a new minimum revenue index variable that combines the price, minimum nights, and calculated host listings count into a single value for each dwelling. Then sort the data frame by that value to see which dwelling has the most financial strength.



In [None]:
# Minimum revenue index: price + minimum_nights + calculated_host_listings_count,
# stored as a new 'mri' column on df.
# NOTE(review): a plain sum mixes units (price, nights, listing count) — confirm this is the intended way to combine them.
df['mri'] = (
    df['price']
    .add(df['minimum_nights'])
    .add(df['calculated_host_listings_count'])
)

In [None]:
# Copy of df ordered from the highest mri to the lowest.
mri_sort = df.sort_values(by='mri', ascending=False)

In [None]:
# Show the top row of the sorted frame, i.e. the listing with the highest mri (all columns are kept).
mri_sort.iloc[:1]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,mri
10825,9528920,"Quiet, Clean, Lit @ LES & Chinatown",3906464,Amy,Manhattan,Lower East Side,40.71355,-73.98507,Private room,9999,99,6,2016-01-01,0.11,1,83,10099


### Group the data by neighbourhood_group and calculate the average minimum revenue index for each neighbourhood_group.

In [None]:
# Average mri within each neighbourhood_group.
df.groupby(by='neighbourhood_group')['mri'].mean()

neighbourhood_group
Bronx             97.798831
Brooklyn         134.571522
Manhattan        238.448982
Queens           113.417765
Staten Island    123.291892
Name: mri, dtype: float64