# Data Investigation with Pandas Assignment

## 1.0 Importing our Libraries

In [1]:
# Importing the libraries we will need

# Importing the pandas library
#
import pandas as pd

# Importing the numpy library
#
import numpy as np

## 1.1 Reading the Dataset from our CSV file





The dataset we will use contains information about the 2009 Census - The Rural Urban Population by Age, Sex and District.






In [2]:
# Let's read the data from the CSV file and create the dataframe to be used
#
census = pd.read_csv('Rural_Urban_Population_By_Age_Sex_and_by_District__2009.csv')
census.columns

Index(['District', 'Urban/Rural', 'Age_years', 'Male', 'Female', 'Total',
       'County', 'Province', 'OBJECTID'],
      dtype='object')

In [3]:
# Check the number of rows and columns available
census.shape

(25420, 9)

## 1.2 Previewing our Dataset


In [4]:
# Let's preview the first 10 rows of our data
#
census.head(10)

Unnamed: 0,District,Urban/Rural,Age_years,Male,Female,Total,County,Province,OBJECTID
0,NAIROBI WEST,Urban,0,10126,10116,20242,Nairobi,NAIROBI,0
1,NAIROBI WEST,Urban,1,8124,7989,16113,Nairobi,NAIROBI,1
2,NAIROBI WEST,Urban,2,8462,8627,17089,Nairobi,NAIROBI,2
3,NAIROBI WEST,Urban,3,8334,8420,16754,Nairobi,NAIROBI,3
4,NAIROBI WEST,Urban,4,7948,7820,15768,Nairobi,NAIROBI,4
5,NAIROBI WEST,Urban,5,7547,7660,15207,Nairobi,NAIROBI,5
6,NAIROBI WEST,Urban,6,6966,7082,14048,Nairobi,NAIROBI,6
7,NAIROBI WEST,Urban,7,6511,6674,13185,Nairobi,NAIROBI,7
8,NAIROBI WEST,Urban,8,6171,6489,12660,Nairobi,NAIROBI,8
9,NAIROBI WEST,Urban,9,6361,6662,13023,Nairobi,NAIROBI,9


In [5]:
# Let's preview the last 10 rows of our data
census.tail(10)

Unnamed: 0,District,Urban/Rural,Age_years,Male,Female,Total,County,Province,OBJECTID
25410,TESO SOUTH,Rural,72,85,96,181,Busia,WESTERN,25410
25411,TESO SOUTH,Rural,73,93,75,168,Busia,WESTERN,25411
25412,TESO SOUTH,Rural,74,91,124,215,Busia,WESTERN,25412
25413,TESO SOUTH,Rural,75,72,108,180,Busia,WESTERN,25413
25414,TESO SOUTH,Rural,76,66,59,125,Busia,WESTERN,25414
25415,TESO SOUTH,Rural,77,99,86,185,Busia,WESTERN,25415
25416,TESO SOUTH,Rural,78,63,47,110,Busia,WESTERN,25416
25417,TESO SOUTH,Rural,79,90,121,211,Busia,WESTERN,25417
25418,TESO SOUTH,Rural,80+,384,391,775,Busia,WESTERN,25418
25419,TESO SOUTH,Rural,Age NS,25,23,48,Busia,WESTERN,25419


## 1.3 Accessing Information about our Dataset

In [6]:
# We can getting to know more about the dataset by accessing its information
#
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25420 entries, 0 to 25419
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   District     25420 non-null  object
 1   Urban/Rural  25420 non-null  object
 2   Age_years    25420 non-null  object
 3   Male         25420 non-null  int64 
 4   Female       25420 non-null  int64 
 5   Total        25420 non-null  int64 
 6   County       25420 non-null  object
 7   Province     25420 non-null  object
 8   OBJECTID     25420 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.7+ MB


In [7]:
# check for missing values
census.isna().sum()

District       0
Urban/Rural    0
Age_years      0
Male           0
Female         0
Total          0
County         0
Province       0
OBJECTID       0
dtype: int64

In [8]:
census.duplicated().sum()

0

## 1.4 Answering Questions

Let's answer the following questions with our dataset

In [9]:
#Challenge 1
# What was the average age of a person of Kenya?

# When we did census.info(), We can see that the Dtype for Age column is an object. We expect it to be an integer

In [10]:
# Let us check for the number of unique values in Age_years column
census['Age_years'].nunique()

82

In [11]:
# Let us view the unique values
# From this we can see 2 values that are not integers - '80+' and 'Age NS'
census['Age_years'].unique()

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34',
       '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
       '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56',
       '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67',
       '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78',
       '79', '80+', 'Age NS'], dtype=object)

In [12]:
# Let us see how many rows contain 'Age NS'
AgeNS = census[census['Age_years'] == 'Age NS']
AgeNS.shape

(310, 9)

In [13]:
# Let us see how many rows contain '80+'
Age80plus = census[census['Age_years'] == '80+']
Age80plus.shape

(310, 9)

In [14]:
# Let us select only the values that can be converted to integers
AgeAvailable = census.loc[(census['Age_years'] != 'Age NS') & (census['Age_years'] != '80+') ]
AgeAvailable.shape

(24800, 9)

In [15]:
# convert Age_years column to integer
AgeAvailable['Age_years'] = pd.to_numeric(AgeAvailable['Age_years'])
AgeAvailable.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24800 entries, 0 to 25417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   District     24800 non-null  object
 1   Urban/Rural  24800 non-null  object
 2   Age_years    24800 non-null  int64 
 3   Male         24800 non-null  int64 
 4   Female       24800 non-null  int64 
 5   Total        24800 non-null  int64 
 6   County       24800 non-null  object
 7   Province     24800 non-null  object
 8   OBJECTID     24800 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 1.9+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AgeAvailable['Age_years'] = pd.to_numeric(AgeAvailable['Age_years'])


In [16]:
# Challenge 1
# What was the average age of a person of Kenya?
AgeAvailable['Age_years'].mean()

39.5

In [17]:
AgeAvailable.describe()

Unnamed: 0,Age_years,Male,Female,Total,OBJECTID
count,24800.0,24800.0,24800.0,24800.0,24800.0
mean,39.5,767.010282,773.526411,1540.536694,12708.49871
std,23.092672,1317.978824,1305.985901,2611.639652,7338.268592
min,0.0,0.0,0.0,0.0,0.0
25%,19.75,60.0,57.0,119.0,6353.75
50%,39.5,276.0,288.0,569.0,12708.5
75%,59.25,908.0,962.0,1876.0,19063.25
max,79.0,23060.0,23090.0,44473.0,25417.0


In [32]:
# Challenge 2
# What was the average female population in Kenya?
#
# The Answer goes here 

In [19]:
# Challenge 3
# What was the average male population in Kenya?
#


In [20]:
# Challenge 4
# Which 5 districts were the most densely populated Kenya in descending order?
#


In [21]:
# Challenge 5
# Which 5 counties were the most densely populated Counties in descending order?
#


In [22]:
# Challenge 6
# Which 5 districts were the most sparsely populated in descending order?
#


In [23]:
# Challenge 7
# What was the average population of the the most densely populated county in Kenya?
#


In [24]:
# Challenge 8
# What was the average population of the the most sparsely populated county in Kenya?
#


In [25]:
# Challenge 9
# What was the total population of Busia County?
#


In [26]:
# Challenge 10
# What was the average age of person in Busia County? How many people of this age were in Busia County?
#


In [27]:
# Challenge 11
# What was the Total population of Teso South District?
#


In [28]:
# Challenge 12
# What was the urban population of Teso South District?
#


In [29]:
# Challenge 13
# What was the rural population of Teso South District?
#


In [30]:
# Challenge 14
# What was the population of people aged 60 and above in Kenya?
#


In [31]:
# Challenge 15
# What was the urban population in Kenya?
#
