# Understanding Descriptive Statistics

Import the necessary libraries here:

In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Challenge 1
#### 1.- Define a function that simulates rolling a dice 10 times. Save the information in a dataframe.
**Hint**: you can use the *choices* function from module *random* to help you with the simulation.

In [None]:
# Roll a dice 10 times

def roll_dice(n_rolls = 10, sides = 6):
    """Simulate rolling a fair die and return the outcomes as a DataFrame.

    Args:
        n_rolls: Number of rolls to simulate.
        sides: Number of faces of the die; faces are numbered 1..sides.

    Returns:
        A DataFrame with a single 'Value' column holding one outcome per
        roll, indexed from 1 to n_rolls.
    """
    outcomes = random.choices(range(1, sides + 1), k = n_rolls)
    rolls = pd.DataFrame(outcomes, columns = ['Value'])
    rolls.index += 1  # number the rolls starting at 1 instead of 0
    return rolls


rolls_df = roll_dice(10)

rolls_df

#### 2.- Plot the results sorted by value.

In [None]:
# Plot dice-roll results (histogram)

# Bin edges at 0.5, 1.5, ..., 6.5 so that each bar is centred on one face value (1-6).
plt.hist(rolls_df, bins = np.arange(1, 8) - 0.5)
# Restrict the y-axis ticks to integers (the bars are counts).
plt.locator_params(axis = 'y', integer = True)
plt.show()

#### 3.- Calculate the frequency distribution and plot it. What is the relation between this plot and the plot above? Describe it with words.

In [None]:
# Count how often each face (1-6) came up; faces that were never rolled get a count of 0
freq_rolls = (
    rolls_df['Value']
    .value_counts()
    .reindex(range(1, 7), fill_value = 0)
    .tolist()
)

print('Frequency list:', freq_rolls)

# Frequency table; shifting the index by one makes row label == face value
freq_df = pd.DataFrame({'Frequency': freq_rolls})
freq_df.index += 1

display(freq_df)

# Bar plot of the frequency table, with integer ticks on the count axis
plt.bar(freq_df.index, freq_df['Frequency'])
plt.locator_params(axis = 'y', integer = True)
plt.show()

In [None]:
# ANSWER:
# Both plots depict the same information.
# The histogram in step 2 already computes the frequency distribution internally; step 3 does the same counting explicitly before plotting.

## Challenge 2
Now, using the dice results obtained in *challenge 1*, you are going to define some functions that will help you calculate the mean of your data in two different ways, the median and the four quartiles. 

#### 1.- Define a function that computes the mean by summing all the observations and dividing by the total number of observations. You are not allowed to use any methods or functions that directly calculate the mean value. 

In [None]:
# Calculate the mean
def get_mean(values):
    """Return the arithmetic mean of *values*.

    The mean is computed by hand: the sum of all observations divided
    by the number of observations.

    Args:
        values: A sized collection of numbers (list, tuple, pandas Series, ...).

    Returns:
        The sum of the observations divided by their count.
    """
    total = sum(values)
    n_observations = len(values)
    return total / n_observations

# Mean of the dice rolls stored in rolls_df['Value'], computed with get_mean
print(f"Dice-rolls mean: {get_mean(rolls_df['Value'])}")

#### 2.- First, calculate the frequency distribution. Then, calculate the mean using the values of the frequency distribution you've just computed. You are not allowed to use any methods or functions that directly calculate the mean value. 

In [None]:
# Roll frequency distribution list (position 0 holds the count of face 1, ..., position 5 of face 6)
freq_rolls = (
    rolls_df['Value']
    .value_counts()
    .reindex(range(1, 7), fill_value = 0)
    .tolist()
)

# Weighted mean: every face value times how often it occurred, divided by the number of rolls
mean_from_freq = sum(face * count for face, count in enumerate(freq_rolls, start = 1)) / sum(freq_rolls)

print(f"Dice-rolls mean: {mean_from_freq}")

#### 3.- Define a function to calculate the median. You are not allowed to use any methods or functions that directly calculate the median value. 
**Hint**: you might need to define two computation cases depending on the number of observations used to calculate the median.

In [None]:
# Random observations
# One sample with an even (10) and one with an odd (11) number of values drawn from 1-9,
# so that both branches of the median calculation get exercised.
even_observation = random.choices(range(1, 10), k = 10)
odd_observation = random.choices(range(1, 10), k = 11)

# Calculate the median
def get_median(values):
    """Return the median of *values* without using a ready-made median helper.

    Args:
        values: A non-empty iterable of numbers (list, tuple, pandas Series, ...).

    Returns:
        The middle element of the sorted data when the number of
        observations is odd; otherwise the mean of the two middle
        elements.

    Raises:
        ValueError: If *values* is empty. An empty sample has no median;
            returning a number for it would silently hide a data problem.
    """
    ordered = sorted(values)
    count = len(ordered)
    if count == 0:
        raise ValueError('get_median() requires at least one observation')

    middle = count // 2
    if count % 2:
        # Odd count: exactly one element sits in the middle.
        return ordered[middle]
    # Even count: average the two elements around the middle.
    return (ordered[middle - 1] + ordered[middle]) / 2

# Show each sample in sorted order next to the median computed by get_median
print('Even observation', sorted(even_observation), 'Median:', get_median(even_observation))
print('Odd observation', sorted(odd_observation), 'Median:', get_median(odd_observation))

#### 4.- Define a function to calculate the four quartiles. You can use the function you defined above to compute the median but you are not allowed to use any methods or functions that directly calculate the quartiles. 

In [None]:
# Random observations
# Fresh samples with an even (10) and an odd (11) number of values drawn from 1-9.
even_observation = random.choices(range(1, 10), k = 10)
odd_observation = random.choices(range(1, 10), k = 11)

# Calculate quartiles
def get_quartiles(values):
    """Return the four quartiles of *values* as a tuple (Q1, Q2, Q3, Q4).

    Q2 is the median of all observations. Q1 and Q3 are the medians of
    the lower and the upper half of the sorted data; when the number of
    observations is odd, the middle observation is part of both halves.
    Q4 is the largest observation.

    Args:
        values: An iterable of numbers.

    Returns:
        A 4-tuple (q1, q2, q3, maximum).
    """
    ordered = sorted(values)
    size = len(ordered)
    half = size // 2

    # size % 2 is 1 for an odd count, which pulls the middle element into the lower half;
    # the upper half starts at the middle element in that case as well.
    lower_half = ordered[:half + size % 2]
    upper_half = ordered[half:]

    return (
        get_median(lower_half),
        get_median(ordered),
        get_median(upper_half),
        max(ordered),
    )

# Show each sample in sorted order next to the tuple returned by get_quartiles
print('Even observation:', sorted(even_observation), 'Quartiles:', get_quartiles(even_observation))
print('Odd observation:', sorted(odd_observation), 'Quartiles:', get_quartiles(odd_observation))

## Challenge 3
Read the csv `roll_the_dice_hundred.csv` from the `data` folder.
#### 1.- Sort the values and plot them. What do you see?

In [None]:
# Load the hundred dice rolls and clean the frame in one chain:
# drop the 'Unnamed: 0' column and use the 'roll' column as the index
rolls_hundreds = (
    pd.read_csv('../data/roll_the_dice_hundred.csv')
    .drop(columns = 'Unnamed: 0')
    .set_index('roll')
)
rolls_hundreds.index += 1  # shift the roll index up by one

display(rolls_hundreds.head())

# Histogram with bin edges at 0.5, 1.5, ..., 6.5 (one bar per face value)
plt.hist(rolls_hundreds, bins = np.arange(1, 8) - 0.5)
plt.show()

In [None]:
# ANSWER:
# A lot of people got lucky with their rolls.
# Many 6s, also 4s.

#### 2.- Using the functions you defined in *challenge 2*, calculate the mean value of the hundred dice rolls.

In [None]:
# Calculate mean (hundreds)
# Applies get_mean to the 'value' column of the hundred-roll frame.
mean_hundreds = get_mean(rolls_hundreds['value'])

print('Mean (a hundred rolls): ', mean_hundreds)

#### 3.- Now, calculate the frequency distribution.


In [None]:
# Calculate frequency distribution
# value_counts() tallies each distinct value; sort_index() orders the tally by value rather than by count.
freq_dist = rolls_hundreds['value'].value_counts().sort_index()

freq_dist

#### 4.- Plot the histogram. What do you see (shape, values...) ? How can you connect the mean value to the histogram? 

In [None]:
# Plot rolls (hundreds) histogram + mean
# Bin edges at 0.5, 1.5, ..., 6.5 give one bar per face value.
plt.hist(rolls_hundreds, bins = np.arange(1, 8) - 0.5)
# Mark mean_hundreds with a red vertical line on top of the histogram.
plt.axvline(x = mean_hundreds, color = 'r')
plt.show()

In [None]:
# ANSWER:
# This time we rolled a hundred times ...
# so it's already more flattened, compared to the 10-rolls data set
# That's why the mean is shifting towards the center value (3.5).

#### 5.- Read the `roll_the_dice_thousand.csv` from the `data` folder. Plot the frequency distribution as you did before. Has anything changed? Why do you think it changed?

In [None]:
# Load the thousand dice rolls and clean the frame in one chain:
# drop the 'Unnamed: 0' column and use the 'roll' column as the index
rolls_thousands = (
    pd.read_csv('../data/roll_the_dice_thousand.csv')
    .drop(columns = 'Unnamed: 0')
    .set_index('roll')
)
rolls_thousands.index += 1  # shift the roll index up by one

display(rolls_thousands.head())

# Histogram with bin edges at 0.5, 1.5, ..., 6.5 (one bar per face value)
plt.hist(rolls_thousands, bins = np.arange(1, 8) - 0.5)
plt.show()

# Mean of the 'value' column, computed with get_mean
print('Mean (a thousand rolls): ', get_mean(rolls_thousands['value']))

In [None]:
# ANSWER:
# This time we rolled a THOUSAND times ...
# so the results are almost equally distributed
# The mean is almost at the center (3.5).

## Challenge 4
In the `data` folder of this repository you will find three different files with the prefix `ages_population`. These files contain information about a poll answered by a thousand people regarding their age. Each file corresponds to the poll answers in different neighbourhoods of Barcelona.

#### 1.- Read the file `ages_population.csv`. Calculate the frequency distribution and plot it as we did during the lesson. Try to guess the range in which the mean and the standard deviation will be by looking at the plot. 

In [None]:
# Create + clean dataframe
ages_pop_1 = pd.read_csv('../data/ages_population.csv')
# Cast the 'observation' column to integers.
ages_pop_1 = ages_pop_1.astype({'observation': 'int'})

# Calculate age distribution frequency
# value_counts() on the frame counts identical rows; reset_index() turns the counted values back into a column.
ages_1_freq = ages_pop_1.value_counts().to_frame().reset_index()
# Rename the count column from 0 to 'count'.
# NOTE(review): newer pandas versions appear to name this column 'count' already, which would make the rename a no-op - confirm.
ages_1_freq.rename(columns = {0: 'count'}, inplace= True)
ages_1_freq.index += 1

display(ages_1_freq.head())

# Plot age distribution histogram
# The number of bins equals the largest observed age.
# NOTE(review): the bins span min..max of the data, so they may not line up with whole years - confirm this is intended.
plt.hist(ages_pop_1['observation'], bins = max(ages_pop_1.observation))
plt.show()

In [None]:
# ANSWER:
# It's a normal frequency distribution, so...
# looking at the plot I'd guess: MEAN => ~39 years | STD => ~10 years

#### 2.- Calculate the exact mean and standard deviation and compare them with your guesses. Do they fall inside the ranges you guessed?

In [None]:
# Calculate mean + standard deviation
mean_pop_1 = np.mean(ages_pop_1['observation'])
# NOTE(review): np.std defaults to ddof=0 (population standard deviation) - confirm that is the intended estimator.
std_pop_1 = np.std(ages_pop_1['observation'])

print('Mean:', mean_pop_1)
print('Standard deviation:', std_pop_1)

In [None]:
# ANSWER:
# 39 (mean), 10 (std) ... I was kinda close.

#### 3.- Now read the file `ages_population2.csv` . Calculate the frequency distribution and plot it.

In [None]:
# Create + clean dataframe
ages_pop_2 = pd.read_csv('../data/ages_population2.csv')
# Cast the 'observation' column to integers.
ages_pop_2 = ages_pop_2.astype({'observation': 'int'})

# Calculate age distribution frequency
# value_counts() on the frame counts identical rows; reset_index() turns the counted values back into a column.
ages_2_freq = ages_pop_2.value_counts().to_frame().reset_index()
# Rename the count column from 0 to 'count'.
# NOTE(review): newer pandas versions appear to name this column 'count' already, which would make the rename a no-op - confirm.
ages_2_freq.rename(columns = {0: 'count'}, inplace= True)
ages_2_freq.index += 1

display(ages_2_freq.head())

# Plot age distribution histogram
# The number of bins equals the number of rows of the frequency table (one per distinct counted value).
plt.hist(ages_pop_2['observation'], bins = len(ages_2_freq))
plt.show()

####  4.- What do you see? Is there any difference with the frequency distribution in step 1?

In [None]:
# ANSWER:
# It is a much younger population.
# The frequency distribution is normal.

#### 5.- Calculate the mean and standard deviation. Compare the results with the mean and standard deviation in step 2. What do you think?

In [None]:
# Calculate mean + standard deviation
mean_pop_2 = np.mean(ages_pop_2['observation'])
# NOTE(review): np.std defaults to ddof=0 (population standard deviation) - confirm that is the intended estimator.
std_pop_2 = np.std(ages_pop_2['observation'])

print('Mean:', mean_pop_2)
print('Standard deviation:', std_pop_2)

In [None]:
# ANSWER:
# The results for mean and standard deviation are to be expected.
# The mean shifts to a younger age because the whole population sample is younger.
# The standard deviation is much smaller, because the span of the group is much narrower.

## Challenge 5
Now it is the turn of `ages_population3.csv`.

#### 1.- Read the file `ages_population3.csv`. Calculate the frequency distribution and plot it.

In [None]:
# Create + clean dataframe
ages_pop_3 = pd.read_csv('../data/ages_population3.csv')
# Cast the 'observation' column to integers.
ages_pop_3 = ages_pop_3.astype({'observation': 'int'})

# Calculate age distribution frequency
# value_counts() on the frame counts identical rows; reset_index() turns the counted values back into a column.
ages_3_freq = ages_pop_3.value_counts().to_frame().reset_index()
# Rename the count column from 0 to 'count'.
# NOTE(review): newer pandas versions appear to name this column 'count' already, which would make the rename a no-op - confirm.
ages_3_freq.rename(columns = {0: 'count'}, inplace= True)
ages_3_freq.index += 1

display(ages_3_freq.head())

# Plot age distribution histogram
# The number of bins equals the largest age in the frequency table.
# NOTE(review): the bins span min..max of the data, so they may not line up with whole years - confirm this is intended.
plt.hist(ages_pop_3['observation'], bins = max(ages_3_freq.observation))
plt.show()

#### 2.- Calculate the mean and standard deviation. Compare the results with the plot in step 1. What is happening?

In [None]:
# Calculate mean + standard deviation
mean_pop_3 = np.mean(ages_pop_3['observation'])
# NOTE(review): np.std defaults to ddof=0 (population standard deviation) - confirm that is the intended estimator.
std_pop_3 = np.std(ages_pop_3['observation'])

print('Mean:', mean_pop_3)
print('Standard deviation:', std_pop_3)

In [None]:
# ANSWER:
# In this population sample there is a significant accumulation of older people.
# For the mean, this means that it continues to shift to an older age.
# The standard deviation increases because the concentration among older people places many observations far from the mean, which adds a lot of spread.

#### 3.- Calculate the four quartiles. Use the results to explain your reasoning for question in step 2. How much of a difference is there between the median and the mean?

In [None]:
# Quartiles of population 3 (the 4th quartile is the maximum), followed by mean and median
print('Quartile 1:\t', np.quantile(ages_pop_3['observation'], 0.25))
print('Quartile 2:\t', np.quantile(ages_pop_3['observation'], 0.50))
print('Quartile 3:\t', np.quantile(ages_pop_3['observation'], 0.75))
print('Quartile 4:\t', np.quantile(ages_pop_3['observation'], 1.00))
print('Mean:\t\t', np.mean(ages_pop_3['observation']))
# Take the median of the 'observation' column, like every other statistic in this cell,
# rather than of the whole DataFrame (which would mix in any additional column).
print('Median:\t\t', np.median(ages_pop_3['observation']))

In [None]:
# ANSWER:
# The difference between the mean and the median is almost 2 years - due to the accumulation of older people.
# The most extensive collection of people is in the second 25%, as shown by the span of only 10 years (q2 - q1).
# But we can also see the accumulation in older people by comparing the 4th 25% (24 years) to the 1st 25% (30 years).

#### 4.- Calculate other percentiles that might be useful to give more arguments to your reasoning.

In [None]:
# Additional quantiles of population 3: the 15% and 85% marks and the maximum (quantile 1.00)
print('Quantile 0.15:\t', np.quantile(ages_pop_3['observation'], 0.15))
print('Quantile 0.85:\t', np.quantile(ages_pop_3['observation'], 0.85))
print('Quantile 1.00:\t', np.quantile(ages_pop_3['observation'], 1.00))

In [None]:
# ANSWER:
# I would use the 0.15, 0.85 and 1.00 quantiles.
# There are hardly any observations for the first 15% (25-year span).
# In the last 15% there is an accumulation (only 13-year span).

## Bonus challenge
Compare the information about the three neighbourhoods. Prepare a report about the three of them. Remember to find out their similarities and differences, backing your arguments with basic statistics.

# **Neighbourhoods Report**

In [None]:
# Histograms of the three samples, each drawn with the same bin count as in its own challenge cell

# Sample 1: number of bins = largest observed age
print('Population sample 1')
plt.hist(ages_pop_1['observation'], bins = max(ages_pop_1.observation))
plt.show()

# Sample 2: number of bins = number of rows of its frequency table
print('Population sample 2')
plt.hist(ages_pop_2['observation'], bins = len(ages_2_freq))
plt.show()

# Sample 3: number of bins = largest age in its frequency table
print('Population sample 3')
plt.hist(ages_pop_3['observation'], bins = max(ages_3_freq.observation))
plt.show()

## Assumptions
1. The sample sizes of Population 1, Population 2, and Population 3 are the same.

In [None]:
# Number of rows (observations) in each of the three samples
print(f'''
   Population 1: {len(ages_pop_1)} observations
   Population 2: {len(ages_pop_2)} observations
   Population 3: {len(ages_pop_3)} observations''')

2. Population 1 and Population 2 are approximately normally distributed. This is supported by the closeness of the mean and the median, which indicates a symmetric distribution, together with the bell shape of the histograms above.

In [None]:
# Calculate medians population 1 + population 2
med_pop_1 = np.median(ages_pop_1['observation'])
med_pop_2 = np.median(ages_pop_2['observation'])

# Print each mean (mean_pop_1 / mean_pop_2) next to its median and their difference
print(f'Population 1:\nMean: {mean_pop_1}\tMedian: {med_pop_1}\tDifference: {mean_pop_1 - med_pop_1}')
print(f'Population 2:\nMean: {mean_pop_2}\tMedian: {med_pop_2}\tDifference: {mean_pop_2 - med_pop_2}')

3. There is a significant aggregation of older people in population 3.

In [None]:
# Select older people (above 63 and below 100 years of age)
seniors_acc_pop_3 = ages_pop_3[(ages_pop_3['observation'] > 63) & (ages_pop_3['observation'] < 100)].value_counts()

# sum() over the value counts gives the number of selected rows.
# The total is population 3's own size, because the seniors were selected from population 3.
print(f'''Population 3:
{sum(seniors_acc_pop_3)} people above 63 years (out of {len(ages_pop_3)}).
Age range of the data between 85 - 95%: only {int(np.quantile(ages_pop_3['observation'], 0.95) - np.quantile(ages_pop_3['observation'], 0.85))} years
This is also reflected in the difference between the mean and median: {mean_pop_3 - np.median(ages_pop_3['observation'])} years''')

4. When speaking of an accumulation of older people in Population 3, one can also say there is a small gap around 60 years.<br>In any case, the transition to old age does not seem to be distributed homogeneously there.

In [None]:
# Select age gap (strictly between 53 and 63, i.e. 54 - 62 years of age)
pop_3_gap = ages_pop_3[(ages_pop_3['observation'] > 53) & (ages_pop_3['observation'] < 63)].value_counts()

# sum() over the value counts gives the number of selected rows.
# The share is computed against the actual sample size instead of a hard-coded 1000.
print(f'''Population 3:
Only {sum(pop_3_gap)} people between 54 and 62.
Out of {len(ages_pop_3)} observations these are only {sum(pop_3_gap) * 100 / len(ages_pop_3)}%.''')

5. Population 2 is much younger than Population 1 and Population 3. This is easy to see from the age range of the first 75%.

In [None]:
# Upper bound of each range is the 0.75 quantile of the sample, truncated to an int;
# the lower bound 0 is a fixed label, not computed from the data.
print(f'''Age-ranges first 75%:
Population 1:\t0 - {int(np.quantile(ages_pop_1['observation'], 0.75))} years
Population 2:\t0 - {int(np.quantile(ages_pop_2['observation'], 0.75))} years
Population 3:\t0 - {int(np.quantile(ages_pop_3['observation'], 0.75))} years''')

6. Population 2 consists mostly of young adults within a very narrow age range. This can be expressed with the interquartile range.

In [None]:
# Calculate interquartile range (q3 - q1)
q1_pop_2 = np.quantile(ages_pop_2['observation'], 0.25)
q3_pop_2 = np.quantile(ages_pop_2['observation'], 0.75)

# The range and both quartiles are truncated to ints for display.
print(f'''Population 2:
Interquartile Range: {int(q3_pop_2 - q1_pop_2)} years ({int(q1_pop_2)} - {int(q3_pop_2)} years)''')

7. In Population 1, there is a single outlier at 82 years.

In [None]:
# Identify outlier among seniors above 70 years of age
# The boolean mask keeps only the rows whose 'observation' exceeds 70.
ages_pop_1[ages_pop_1['observation'] > 70]

8. It is striking that in Population 1, there are generally hardly any seniors.

In [None]:
# Select seniors above 60 years of age
pop_1_seniors = ages_pop_1[ages_pop_1['observation'] > 60].value_counts()

# sum() over the value counts gives the number of selected rows.
print(f'''Population 1:
Total number of seniors (above 60): {sum(pop_1_seniors)} out of {len(ages_pop_1)} people''')

9. In Population 3 there seem to be hardly any children.

In [None]:
# Age below which the youngest 2% of population 3 fall (the 0.02 quantile, not 0.02%)
children_share = 0.02
# Use the 'observation' column explicitly, as the other statistics do, instead of the whole DataFrame.
pop_3_children = int(np.quantile(ages_pop_3['observation'], children_share))

# The printed percentage is derived from children_share so that text and computation cannot drift apart.
print(f'''Population 3:
Only {children_share:.0%} of the population are under the age of {pop_3_children}.''')