# Assignment 5 - Populations
Author: Vanessa Lyra

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Part 1
Write a jupyter notebook that analyses the differences between the sexes by age in Ireland.

Weighted mean age (by sex)
The difference between the sexes by age

In [None]:
# Fetching data
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/FY006A/CSV/1.0/en"
df = pd.read_csv(url)

# Checking and printing data columns
headers = df.columns.tolist()
headers


['STATISTIC',
 'Statistic Label',
 'TLIST(A1)',
 'CensusYear',
 'C02199V02655',
 'Sex',
 'C02076V03371',
 'Single Year of Age',
 'C03789V04537',
 'Administrative Counties',
 'UNIT',
 'VALUE']

In [3]:
# Cleansing data
# Drop all unwanted colunms
drop_columns = ["STATISTIC", "Statistic Label","TLIST(A1)", "CensusYear", "C02199V02655", "C02076V03371", "C03789V04537", "Administrative Counties", "UNIT"]
df.drop(columns=drop_columns, inplace=True, errors='ignore')

# Replacing unwanted characters
df = df[df["Single Year of Age"] != "All ages"]
df["Single Year of Age"] = df["Single Year of Age"].replace("Under 1 year", "0")
df["Single Year of Age"] = df["Single Year of Age"].replace(r"\D", "", regex=True)
df["Single Year of Age"] = df["Single Year of Age"].astype('int64')

# Ensuring only numeric values
df['VALUE']=df['VALUE'].astype('int64')

# Pivot data
df_anal = pd.pivot_table(df,"VALUE","Single Year of Age","Sex")

# Python couldn't find Single Year of Age, line addedd to transform it in a regular column again
df_anal = df_anal.reset_index()

# Weighted means
mean_female = np.average(df_anal["Single Year of Age"], weights=df_anal["Female"])
mean_male = np.average(df_anal["Single Year of Age"], weights=df_anal["Male"])

# Printing results
print("Weighted mean age (by sex)")
print(f"Weighted mean females: {mean_female:.2f}")
print(f"Weighted mean males: {mean_male:.2f}")
print(f"Difference (Females & Males): {mean_female - mean_male:.2f}\n")

# Difference between the sexes by age
df_anal["Difference (Female - Male)"] = df_anal["Female"] - df_anal["Male"]
print(df_anal[["Single Year of Age", "Difference (Female - Male)"]].head())


Weighted mean age (by sex)
Weighted mean females: 38.94
Weighted mean males: 37.74
Difference (Females & Males): 1.20

Sex  Single Year of Age  Difference (Female - Male)
0                     0                    -89.0000
1                     1                    -83.1250
2                     2                    -78.8750
3                     3                    -94.8750
4                     4                   -116.6875


https://www.geeksforgeeks.org/pandas/python-pandas-dataframe-reset_index/ Reset index


### Part 2
In the same notebook, make a variable that stores an age (say 35).

Write that code that would group the people within 5 years of that age together, into one age group 

Calculate the population difference between the sexes in that age group.

In [4]:
# Defining age for analysis
base_age = 35

# Defining variables fro ages group
younger_group = base_age - 5
older_group = base_age + 5

# Age group print statement
print(f"Age group of study: {younger_group} - {older_group}")

# Finding people at defined age group in datafrme
age_group = df_anal[
    (df_anal["Single Year of Age"] >= younger_group) &
    (df_anal["Single Year of Age"] <= older_group)]

# Separating age group by sex
female_group = age_group["Female"].sum()
male_group = age_group["Male"].sum()

# Calculation age difference between sexes in age group
sexes_diff = female_group - male_group

# Printing results to user 
print(f"Female group:{female_group}")
print(f"Male group:{male_group}")
print(f"Sexes difference : {sexes_diff}\n")


Age group of study: 30 - 40
Female group:25906.625
Male group:24001.875
Sexes difference : 1904.75



https://stackoverflow.com/questions/38884466/how-to-select-a-range-of-values-in-a-pandas-dataframe-column retireving range of data from df inspiration
https://www.statology.org/pandas-groupby-range/ .sum in df inspiration

### Part 3
Write the code that would work out which region in Ireland has the biggest population difference between the sexes in that age group

In [None]:
# Fetching data
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/FY006A/CSV/1.0/en"
df = pd.read_csv(url)

# Cleansing data
# Drop all unwanted colunms
drop_columns = ["STATISTIC", "Statistic Label","TLIST(A1)", "CensusYear", "C02199V02655", "C02076V03371", "C03789V04537", "UNIT"]
df.drop(columns=drop_columns, inplace=True, errors='ignore')

# Removing Ireland from Administrative counties data
df = df[df["Administrative Counties"] != "Ireland"]

# Replacing unwanted characters
df = df[df["Single Year of Age"] != "All ages"]
df["Single Year of Age"] = df["Single Year of Age"].replace("Under 1 year", "0")
df["Single Year of Age"] = df["Single Year of Age"].replace(r"\D", "", regex=True)
df["Single Year of Age"] = df["Single Year of Age"].astype('int64')

# Ensuring only numeric values
df['VALUE']=df['VALUE'].astype('int64')

# Pivoting data
# Reseting index to transform "Single Year of Age", "Administrative Counties" from index into a regular column for analysis
df_anal2 = pd.pivot_table(df, values="VALUE", index=["Single Year of Age","Administrative Counties"], columns="Sex", aggfunc="sum").reset_index()

# Defining age for analysis
base_age = 35

# Defining variables fro ages group
younger_group = base_age - 5
older_group = base_age + 5

# Finding people at defined age group in datafrme
age_group = df_anal2[
    (df_anal2["Single Year of Age"] >= younger_group) &
    (df_anal2["Single Year of Age"] <= older_group)]


#Grouping age group by regions and sex
irl_region = age_group.groupby("Administrative Counties").agg(
    female_group=("Female", "sum"),
    male_group=("Male", "sum"))

#Calculate difference by sex
irl_region["sex_difference"] = irl_region["female_group"] - irl_region["male_group"]

#Finding region with highest age difference
region_diff = irl_region["sex_difference"].idxmax()
diff_value = irl_region.loc[region_diff, "sex_difference"]


#Printing statements to user
print(f"\n Population difference between sexes for ages {younger_group} - {older_group}:\n")
print("\n Region with the largest sex difference:")
print(f"{region_diff} and difference is: {diff_value}")



 Population difference between sexes for ages 30 - 40:


 Region with the largest sex difference:
Fingal County Council and difference is: 2942
                                       female_group  male_group  \
Administrative Counties                                           
Carlow County Council                          4774        4451   
Cavan County Council                           6150        5776   
Clare County Council                           8896        8085   
Cork City Council                             19750       18812   
Cork County Council                           26545       23706   
Donegal County Council                        11700       10621   
Dublin City Council                           59831       60867   
Dún Laoghaire Rathdown County Council         18450       17074   
Fingal County Council                         29092       26150   
Galway City Council                            7650        7156   
Galway County Council                         13904

https://www.statology.org/pandas-filter-by-column-value-not-equal/ #Ignore Ireland from dataframe
https://medium.com/@heyamit10/understanding-groupby-and-aggregate-in-pandas-f45e524538b9 #Groupby and aggregate
https://www.geeksforgeeks.org/python/
https://community.dataquest.io/t/pandas-return-row-with-the-maximum-value-of-a-column/258474 #idxmax
https://saturncloud.io/blog/how-to-search-pandas-data-frame-by-index-value-and-value-in-any-column/ #.loc