# Descriptive Statistics Assignment

In [None]:
import pandas as pd

# Inside Airbnb listings CSV for New York City; the path pins the 2020-06-08 snapshot.
# NOTE(review): plain-http link to a dated snapshot — confirm it is still being served.
data_url = "http://data.insideairbnb.com/united-states/ny/new-york-city/2020-06-08/visualisations/listings.csv"

### Read the aggregated AirBnB listings of New York (using `data_url`) into a Pandas data frame.

In [None]:
# Download the listings CSV straight from the remote URL into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_url)

In [None]:
# Preview the first five listings.
data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2060,Modern NYC,2259,Jenny,Manhattan,Washington Heights,40.85722,-73.9379,Private room,100,1,1,2008-09-22,0.01,1,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,3,48,2019-11-04,0.37,2,335
2,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,322,2020-06-07,4.64,1,276
3,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,78,2019-10-13,0.58,1,0
4,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,29,50,2019-12-02,0.37,1,365


### Print the data type for each column.

In [None]:
# Per-column dtypes as inferred by pandas when the CSV was read.
data.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

### Generate a basic set of descriptive statistics for the numeric fields in the data set.

In [None]:
# Summary table (count, mean, std, min, quartiles, max) for the numeric columns.
# The quartiles are spelled out explicitly; they are the same ones describe() uses by default.
desc = data.describe(percentiles=[0.25, 0.5, 0.75])

### Calculate the following for each numeric field in the data set and add them to the data frame containing the basic set of descriptive statistics.

- Variance
- Mean Absolute Deviation
- Range
- Interquartile Range

In [None]:
# Sample variance of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where reductions no longer
# silently skip the text columns (name, host_name, ...) and raise TypeError instead.
variance = data.var(numeric_only=True)

# Store the result as a new row of the summary table; assignment aligns on column
# labels and simply overwrites the row if the cell is re-run.
desc.loc['var'] = variance

variance

id                                1.829752e+14
host_id                           9.776457e+15
latitude                          2.989206e-03
longitude                         2.260709e-03
price                             1.758228e+05
minimum_nights                    4.828933e+02
number_of_reviews                 2.327659e+03
reviews_per_month                 1.809598e+00
calculated_host_listings_count    6.495002e+02
availability_365                  2.027247e+04
dtype: float64

In [None]:
# Mean absolute deviation around the mean for every numeric column.
# DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so it is
# computed explicitly as mean(|x - mean(x)|); NaNs are skipped by both means.
numeric_cols = data.select_dtypes(include='number')
mad = (numeric_cols - numeric_cols.mean()).abs().mean()

# Store the result as a new row of the summary table (idempotent on re-run).
desc.loc['mad'] = mad

mad

id                                1.181604e+07
host_id                           8.179277e+07
latitude                          4.336345e-02
longitude                         3.274414e-02
price                             1.090032e+02
minimum_nights                    8.992644e+00
number_of_reviews                 2.906928e+01
reviews_per_month                 9.489338e-01
calculated_host_listings_count    8.506215e+00
availability_365                  1.265533e+02
dtype: float64

In [None]:
# Pull just the 'max' and 'min' rows out of the summary table (all columns kept).
extreme_rows = ['max', 'min']
max_min = desc.loc[extreme_rows, :]

In [None]:
# Range = max - min, taken from the two rows already extracted into `max_min`.
value_range = max_min.loc['max'] - max_min.loc['min']
# Interquartile range = 75th percentile - 25th percentile, read from the summary table.
iqr = desc.loc['75%'] - desc.loc['25%']

# Append both as new rows of the summary table; plain assignment overwrites the
# rows on re-run, so the cell stays idempotent.
desc.loc['range'] = value_range
desc.loc['iqr'] = iqr

for col, span in value_range.items():
    print(f'range in {col} = {span}')

# Show the two newly added rows.
desc.loc[['range', 'iqr']]

range in id = 43701299.0
range in host_id = 349080382.0
range in latitude = 0.4118999999999957
range in longitude = 0.5278500000000008
range in price = 10000.0
range in minimum_nights = 1249.0
range in number_of_reviews = 746.0
range in reviews_per_month = 53.79
range in calculated_host_listings_count = 279.0
range in availability_365 = 365.0


### Generate a basic set of descriptive statistics for the categorical fields in the data set.

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns,
# transposed so that each column of the data set becomes one row.
data.describe(include=["object"]).transpose()

Unnamed: 0,count,unique,top,freq
name,49512,48247,Home away from home,21
host_name,49524,11572,Michael,416
neighbourhood_group,49530,5,Manhattan,21963
neighbourhood,49530,222,Williamsburg,3752
room_type,49530,4,Entire home/apt,25716
last_review,38211,2077,2020-01-01,940


### Calculate the Pearson correlation coefficients between price, minimum_nights, reviews_per_month, calculated_host_listings_count and availability_365

In [None]:
# Columns whose pairwise Pearson correlation is requested.
corr_columns = [
    'price',
    'minimum_nights',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
new_data = data[corr_columns]

# Pairwise Pearson correlation matrix of the selected columns.
new_data.corr(method='pearson')

Unnamed: 0,price,minimum_nights,reviews_per_month,calculated_host_listings_count,availability_365
price,1.0,0.015923,-0.011623,0.00473,0.055788
minimum_nights,0.015923,1.0,-0.098409,0.165421,0.148435
reviews_per_month,-0.011623,-0.098409,1.0,-0.066725,0.213093
calculated_host_listings_count,0.00473,0.165421,-0.066725,1.0,0.173181
availability_365,0.055788,0.148435,0.213093,0.173181,1.0


### Group the data by neighbourhood and calculate the mean and standard deviation of revenue for each neighbourhood.

In [None]:
# Mean and standard deviation of the listing price within each group.
# NOTE(review): the section heading asks for `neighbourhood`, while this groups by
# `neighbourhood_group` — confirm which level is intended.
aggs = dict(price=['mean', 'std'])
price_by_group = data.groupby(by=['neighbourhood_group'])
new_df = price_by_group.agg(aggs).reset_index()
new_df

Unnamed: 0_level_0,neighbourhood_group,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
0,Bronx,90.176127,98.569331
1,Brooklyn,125.056194,204.835722
2,Manhattan,218.855166,583.519236
3,Queens,99.745056,195.717338
4,Staten Island,116.908108,241.447121


### Create a new minimum revenue index variable that combines the price, minimum nights, and calculated host listings count into a single value for each dwelling. Then sort the data frame by that value to see which dwelling has the most financial strength.



In [None]:
# revenue_index = price x minimum_nights x calculated_host_listings_count, per listing.
nightly_price = data['price']
min_stay = data['minimum_nights']
host_listings = data['calculated_host_listings_count']
data['revenue_index'] = nightly_price * min_stay * host_listings

# The five listings with the largest index; reset_index() keeps the original row
# label as an 'index' column.
top_five = data.sort_values(by='revenue_index', ascending=False).head(5)
top_five.reset_index()

Unnamed: 0,index,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,revenue_index
0,14754,13443640,Deluxe 3-BR + 3.5Bath with sweeping city views!!,30283594,Global Luxury Suites,Manhattan,Midtown,40.75143,-73.96973,Entire home/apt,1100,120,0,,,83,184,10956000
1,31916,30389115,"Airy East Village 1BR w/ Doorman, Gym, near NY...",107434423,Blueground,Manhattan,East Village,40.73099,-73.98844,Entire home/apt,336,90,0,,,280,145,8467200
2,9961,8810949,NYC Upscale Midtown East 3BR Apt,30283594,Global Luxury Suites,Manhattan,Midtown,40.74867,-73.96734,Entire home/apt,1170,80,0,,,83,365,7768800
3,33929,32399523,Premium FiDi 1BR w/ Doorman + Wraparound Roofd...,107434423,Blueground,Manhattan,Financial District,40.70504,-74.00812,Entire home/apt,270,90,0,,,280,251,6804000
4,31992,30394518,Opulent FiDi Studio w/ Great Rooftop + Gym by ...,107434423,Blueground,Manhattan,Financial District,40.70525,-74.01615,Entire home/apt,157,120,1,2019-02-15,0.06,280,318,5275200


### Group the data by neighbourhood_group and calculate the average minimum revenue index for each neighbourhood_group.

In [None]:
# Average revenue_index per neighbourhood_group.
# The previous version grouped by revenue_index itself and summed every column,
# which just reproduced the top individual listings instead of a per-group average.
index_neighborhood = (
    data.groupby('neighbourhood_group')['revenue_index']
    .mean()
    .reset_index()
)

# Highest average first; drop the old positional labels so rows are numbered 0..n-1.
index_neighborhood.sort_values('revenue_index', ascending=False).reset_index(drop=True)

Unnamed: 0,revenue_index,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,10956000,13443640,30283594,40.75143,-73.96973,1100,120,0,0.0,83,184
1,8467200,30389115,107434423,40.73099,-73.98844,336,90,0,0.0,280,145
2,7768800,8810949,30283594,40.74867,-73.96734,1170,80,0,0.0,83,365
3,6804000,32399523,107434423,40.70504,-74.00812,270,90,0,0.0,280,251
4,5275200,30394518,107434423,40.70525,-74.01615,157,120,1,0.06,280,318


1. Identify 3 scenarios in your everyday life where you use probability.

Will our favorite team win the next game?

Depending on what day of the week it is and what time it is, there will be a certain amount of traffic.

What is the chance of rain based on how many weeks per year it rains?

2. Using the formula you learned to calculate probabilities, approximate the probabilities of the events in each of those 3 scenarios.

a) For the question: "Will our favorite team win the next game?"

Our team has won 7 out of 10 games, so their chance of winning is 7/10 or 70%

The other team has won 8/10 so their probability of winning is 80% (or 20% of losing)

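$P(A \cap B) = P(A \mid B) \cdot P(B)$
A = our team wins
B = the other team loses

$P(A \cap B)$ = our team wins and the other team loses

P(A) = .70
P(B) = .20

Joint probability (treating A and B as independent) = .70 × .20 = .14 = 14%

This means that, under the independence assumption, there is a 14% chance that our team wins and the other team loses. (This is not the probability of a draw, which would be a separate outcome.)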

b) for the question: 'What is the probability of getting stuck in traffic on the way to work?'

heavy traffic = 5/7 = .71

light traffic = 2/7 = .29

joint probability = .2 = 20 %

c) for the question 'What is the chance of getting rain and thunder?'

chance of rain = 1/52 = 1.9%

chance of thunder is 2/7 = 29%

joint probability = .005 = 0.5%


3. Think about what potential dependencies the events in each of your 3 scenarios have. Calculate the joint probabilities for the events and their dependencies.

a) the dependency for our team to win is if the other team loses

b) the dependency for getting stuck in traffic is if it is a weekday

c) the dependency for rain and thunder is if the moisture level is over 50%

4. Calculate the conditional probabilities for the events given their dependencies.

a) The joint probability is 14% and the probability of the dependency is 20%, so the conditional probability is 14% / 20% = 70%.

b) Joint probability = 20%. Dependency = 5/7 = 71%.
Conditional probability = joint / dependency = (5/7 × 2/7) / (5/7) = 2/7 ≈ 28.6%

c) joint probability = 0.5%, dependency = 3%
conditional probability = 17%