In [1]:
# import important libraries for EDA
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw file into a DataFrame.
# header=None keeps the file's first line as an ordinary data row instead of
# using it for column names. No `sep` is passed, so pandas applies its default
# comma delimiter.
# NOTE(review): the fields look ';'-separated, so each line presumably lands in
# a single column -- confirm that no field contains a comma.
df = pd.read_csv(
    filepath_or_buffer="us-cities-demographics.csv",
    header=None,
)

In [3]:
# Expand the semicolon-separated data into individual columns.
# Column 0 holds each raw line; splitting on ';' with expand=True yields one
# column per field.
df = df.iloc[:, 0].str.split(';', expand=True)

# The first row carries the column titles.
new_header = df.iloc[0]

# Assign the titles as a plain list. Assigning the row Series itself would also
# copy its name (the row label 0) into `df.columns.name`, which then shows up
# as a stray "0" line above every dtypes listing and table display.
df.columns = new_header.tolist()

# Drop the title row from the data; remaining rows keep their labels (1, 2, ...).
df = df.iloc[1:]

In [4]:
# Report each column's dtype, then the (rows, columns) shape of the frame.
print(
    " Data types:\n",
    df.dtypes,
    "\n \nShape of Dataframe: ",
    df.shape,
    sep=" ",
)

 Data types:
 0
City                      object
State                     object
Median Age                object
Male Population           object
Female Population         object
Total Population          object
Number of Veterans        object
Foreign-born              object
Average Household Size    object
State Code                object
Race                      object
Count                     object
dtype: object 
 
Shape of Dataframe:  (2891, 12)


In [5]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
1,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924
2,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723
3,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
4,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
5,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402


In [4]:
# Rename columns: replace spaces and hyphens with underscores so every column
# name is a valid identifier (later cells access them as attributes, e.g.
# df.Median_Age).
df = df.rename(
    columns={
        'Median Age': 'Median_Age',
        'Male Population': 'Male_Population',
        'Female Population': 'Female_Population',
        'Total Population': 'Total_Population',
        'Number of Veterans': 'Number_of_Veterans',
        'Foreign-born': 'Foreign_born',
        'Average Household Size': 'Average_Household_Size',
        'State Code': 'State_Code',
    }
)

In [5]:
# Check for duplicate rows (nothing is dropped here, this is only a check).
# duplicated() flags each row that repeats an earlier one; .size of the
# selection is rows * columns, so 0 means there are no duplicate rows.
# There are no duplicate rows in this DataFrame.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask].size

0

In [6]:
# Locate rows with missing values.
# Missing fields in this frame are empty strings (''), which isnull() does not
# flag, so the check compares against '' directly. The earlier isnull()/
# to_numeric probes were evaluated mid-cell and their results discarded, so
# they displayed nothing; they are dropped in favour of the one check that
# actually produces this cell's output.
#
# There are a few rows which have empty values: df.eq('') marks every empty
# cell and any(axis=1) keeps each row that has at least one, across all
# columns (no hand-maintained column list to keep in sync with renames).
df.loc[df.eq('').any(axis=1)]

Unnamed: 0,City,State,Median_Age,Male_Population,Female_Population,Total_Population,Number_of_Veterans,Foreign_born,Average_Household_Size,State_Code,Race,Count
112,San Juan,Puerto Rico,41.4,155408.0,186829.0,342237,,,,PR,Hispanic or Latino,335559
156,Caguas,Puerto Rico,40.4,34743.0,42265.0,77008,,,,PR,Hispanic or Latino,76349
259,Carolina,Puerto Rico,42.0,64758.0,77308.0,142066,,,,PR,American Indian and Alaska Native,12143
334,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,Hispanic or Latino,1066
450,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,Black or African-American,331
638,Carolina,Puerto Rico,42.0,64758.0,77308.0,142066,,,,PR,Hispanic or Latino,139967
1438,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,White,72211
1748,San Juan,Puerto Rico,41.4,155408.0,186829.0,342237,,,,PR,American Indian and Alaska Native,4031
1749,Mayagüez,Puerto Rico,38.1,30799.0,35782.0,66581,,,,PR,Asian,235
1996,Ponce,Puerto Rico,40.5,56968.0,64615.0,121583,,,,PR,Hispanic or Latino,120705


In [8]:
# Show every row recorded for the city of Caguas.
df[df['City'].eq('Caguas')]

Unnamed: 0,City,State,Median_Age,Male_Population,Female_Population,Total_Population,Number_of_Veterans,Foreign_born,Average_Household_Size,State_Code,Race,Count
156,Caguas,Puerto Rico,40.4,34743,42265,77008,,,,PR,Hispanic or Latino,76349
2598,Caguas,Puerto Rico,40.4,34743,42265,77008,,,,PR,American Indian and Alaska Native,624


In [9]:
# Replace empty strings with 0 in the columns that hold numeric values.
# NOTE(review): after this step a missing value is indistinguishable from a
# genuine 0 (e.g. Average_Household_Size) -- confirm that is acceptable for
# downstream consumers.
zero_cols = [
    'Median_Age',
    'Male_Population',
    'Female_Population',
    'Total_Population',
    'Number_of_Veterans',
    'Foreign_born',
    'Average_Household_Size',
    'Count',
]
df[zero_cols] = df[zero_cols].replace(to_replace='', value=0)

In [10]:
# Convert the numeric columns from object dtype to numeric dtypes:
# float for the two fractional measures, int for the head counts.
df = df.astype(
    {
        "Median_Age": float,
        "Male_Population": int,
        "Female_Population": int,
        "Total_Population": int,
        "Number_of_Veterans": int,
        "Foreign_born": int,
        "Average_Household_Size": float,
        "Count": int,
    }
)

In [11]:
# Display the dtype of every column in the frame.
df.dtypes

0
City                       object
State                      object
Median_Age                float64
Male_Population             int64
Female_Population           int64
Total_Population            int64
Number_of_Veterans          int64
Foreign_born                int64
Average_Household_Size    float64
State_Code                 object
Race                       object
Count                       int64
dtype: object

In [10]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,City,State,Median_Age,Male_Population,Female_Population,Total_Population,Number_of_Veterans,Foreign_born,Average_Household_Size,State_Code,Race,Count
1,Silver Spring,Maryland,33.8,40601,41862,82463,1562,30908,2.6,MD,Hispanic or Latino,25924
2,Quincy,Massachusetts,41.0,44129,49500,93629,4147,32935,2.39,MA,White,58723
3,Hoover,Alabama,38.5,38040,46799,84839,4819,8229,2.58,AL,Asian,4759
4,Rancho Cucamonga,California,34.5,88127,87105,175232,5821,33878,3.18,CA,Black or African-American,24437
5,Newark,New Jersey,34.6,138040,143873,281913,5829,86253,2.73,NJ,White,76402


In [13]:
# Show every row recorded for the city of Dallas.
df[df['City'].eq('Dallas')]

Unnamed: 0,City,State,Median_Age,Male_Population,Female_Population,Total_Population,Number_of_Veterans,Foreign_born,Average_Household_Size,State_Code,Race,Count
539,Dallas,Texas,32.6,639019,661063,1300082,41026,326825,2.59,TX,American Indian and Alaska Native,17510
1288,Dallas,Texas,32.6,639019,661063,1300082,41026,326825,2.59,TX,Black or African-American,322570
1812,Dallas,Texas,32.6,639019,661063,1300082,41026,326825,2.59,TX,White,839169
1902,Dallas,Texas,32.6,639019,661063,1300082,41026,326825,2.59,TX,Hispanic or Latino,549966
1903,Dallas,Texas,32.6,639019,661063,1300082,41026,326825,2.59,TX,Asian,47099


In [13]:
# Report each column's dtype, then the (rows, columns) shape of the frame.
print(
    " Data types:\n",
    df.dtypes,
    "\n \nShape of Dataframe: ",
    df.shape,
    sep=" ",
)

 Data types:
 0
City                       object
State                      object
Median_Age                float64
Male_Population             int64
Female_Population           int64
Total_Population            int64
Number_of_Veterans          int64
Foreign_born                int64
Average_Household_Size    float64
State_Code                 object
Race                       object
Count                       int64
dtype: object 
 
Shape of Dataframe:  (2891, 12)


In [20]:
# We will use spark to pivot this table as shown below:
# one row per (State_Code, City), one column per Race, cells taken from Count.
# aggfunc is spelled out; 'mean' is the pandas default, so any repeated
# (State_Code, City, Race) combination is averaged.
# NOTE(review): confirm averaging is the intended handling if such repeats exist.
df.pivot_table(
    values='Count',
    index=['State_Code', 'City'],
    columns='Race',
    aggfunc='mean',
)

Unnamed: 0_level_0,Race,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
State_Code,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AK,Anchorage,36339.0,36825.0,23107.0,27261.0,212696.0
AL,Birmingham,1319.0,1500.0,157985.0,8940.0,51728.0
AL,Dothan,656.0,1175.0,23243.0,1704.0,43516.0
AL,Hoover,,4759.0,18191.0,3430.0,61869.0
AL,Huntsville,1755.0,6566.0,61561.0,10887.0,121904.0
AL,Mobile,2816.0,5518.0,96397.0,5229.0,93755.0
AL,Montgomery,1277.0,6518.0,121360.0,6648.0,73545.0
AL,Tuscaloosa,261.0,2733.0,42331.0,2475.0,52603.0
AR,Fayetteville,2058.0,4707.0,6927.0,5535.0,68830.0
AR,Fort Smith,2993.0,6228.0,9851.0,17104.0,66004.0


In [14]:
# Write the frame to CSV; index=False leaves the row labels out of the file.
df.to_csv(
    path_or_buf='us_cities_demo_final.csv',
    index=False,
)