In [1]:
# Imports and Spark session setup
import pandas as pd

from pyspark.sql import SparkSession
# Build (or reuse) the Spark session, asking Spark to pull in the hadoop-aws
# package through the "spark.jars.packages" setting.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

from pyspark.sql.functions import first
from pyspark.sql.functions import upper, col
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType
from pyspark.sql.functions import udf, date_format
import datetime as dt

In [None]:
# Explore and assess the data

## Explore data
# - Identify data quality issues (missing values, duplicate data etc.)


In [2]:
# Load us-cities-demographics.csv: fields are separated by ';' and the first
# row supplies the column names.
# NOTE(review): no schema inference is requested here, so the columns are
# presumably all loaded as strings -- confirm before doing arithmetic on them.
us_spark = (
    spark.read
    .options(sep=";", header=True)
    .csv("./datasets/us-cities-demographics.csv")
)

In [3]:
# List the column names of the US demographics DataFrame.
# Left as a bare last expression so the notebook displays the resulting list.
us_spark.columns

['City',
 'State',
 'Median Age',
 'Male Population',
 'Female Population',
 'Total Population',
 'Number of Veterans',
 'Foreign-born',
 'Average Household Size',
 'State Code',
 'Race',
 'Count']

In [4]:
# Look for repeated rows in us_spark: show only the city-level columns, sorted
# by city, to see which columns hold identical values across a city's rows.
(
    us_spark
    .select(
        [
            "City",
            "State",
            "Median Age",
            "Male Population",
            "Female Population",
            "Total Population",
            "Foreign-born",
            "Average Household Size",
        ]
    )
    .orderBy("City")
    .show()
)

+-------+----------+----------+---------------+-----------------+----------------+------------+----------------------+
|   City|     State|Median Age|Male Population|Female Population|Total Population|Foreign-born|Average Household Size|
+-------+----------+----------+---------------+-----------------+----------------+------------+----------------------+
|Abilene|     Texas|      31.3|          65212|            60664|          125876|        8129|                  2.64|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|        8129|                  2.64|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|        8129|                  2.64|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|        8129|                  2.64|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|        8129|                  2.64|
|  Akron|      Ohio|      38.1|          96886| 

In [5]:
# Inspect the subset of the US dataset that may be causing the duplicate rows:
# the per-race columns, sorted by city.
(
    us_spark
    .select(["City", "State Code", "Race", "Count"])
    .orderBy("City")
    .show()
)

+-------+----------+--------------------+------+
|   City|State Code|                Race| Count|
+-------+----------+--------------------+------+
|Abilene|        TX|American Indian a...|  1813|
|Abilene|        TX|  Hispanic or Latino| 33222|
|Abilene|        TX|               White| 95487|
|Abilene|        TX|               Asian|  2929|
|Abilene|        TX|Black or African-...| 14449|
|  Akron|        OH|               White|129192|
|  Akron|        OH|  Hispanic or Latino|  3684|
|  Akron|        OH|Black or African-...| 66551|
|  Akron|        OH|               Asian|  9033|
|  Akron|        OH|American Indian a...|  1845|
|Alafaya|        FL|  Hispanic or Latino| 34897|
|Alafaya|        FL|               Asian| 10336|
|Alafaya|        FL|               White| 63666|
|Alafaya|        FL|Black or African-...|  6577|
|Alameda|        CA|               White| 44232|
|Alameda|        CA|American Indian a...|  1329|
|Alameda|        CA|Black or African-...|  7364|
|Alameda|        CA|

In [7]:
# Show every column again, sorted by city, to view the repeated city-level
# values next to the per-race values in the same rows.
(
    us_spark
    .select(
        [
            "City",
            "State",
            "Median Age",
            "Male Population",
            "Female Population",
            "Total Population",
            "Number of Veterans",
            "Foreign-born",
            "Average Household Size",
            "State Code",
            "Race",
            "Count",
        ]
    )
    .orderBy("City")
    .show()
)

#  ['City',
#  'State',
#  'Median Age',
#  'Male Population',
#  'Female Population',
#  'Total Population',
#  'Number of Veterans',
#  'Foreign-born',
#  'Average Household Size',
#  'State Code',
#  'Race',
#  'Count']

+-------+----------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|   City|     State|Median Age|Male Population|Female Population|Total Population|Number of Veterans|Foreign-born|Average Household Size|State Code|                Race| Count|
+-------+----------+----------+---------------+-----------------+----------------+------------------+------------+----------------------+----------+--------------------+------+
|Abilene|     Texas|      31.3|          65212|            60664|          125876|              9367|        8129|                  2.64|        TX|American Indian a...|  1813|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|              9367|        8129|                  2.64|        TX|  Hispanic or Latino| 33222|
|Abilene|     Texas|      31.3|          65212|            60664|          125876|              9367|        8129| 

In [22]:
# Cleaning the dataset -- approach:
# - The 'Race' and 'Count' columns are what make a city show up on several rows,
#   so they are split out into their own dataset, keeping "City" and
#   "State Code" as reference keys.
# - Pivoting on "Race" turns each race into its own column holding that race's
#   "Count", giving one row per ("City", "State Code") pair.
# - The cleaned-up US dataset can later be joined with us_race_cnt so the
#   combined result contains unique rows.
us_race_cnt = (
    us_spark
    .select("City", "State Code", "Race", "Count")
    .groupBy("City", "State Code")
    .pivot("Race")
    .agg(first("Count"))
)

NameError: name 'second' is not defined

In [21]:
# Display the pivoted per-race counts, sorted by city, to verify the result.
us_race_cnt.sort("City").show()

+------------+----------+---------------------------------+-----+-------------------------+------------------+------+
|        City|State Code|American Indian and Alaska Native|Asian|Black or African-American|Hispanic or Latino| White|
+------------+----------+---------------------------------+-----+-------------------------+------------------+------+
|     Abilene|        TX|                             1813| 2929|                    14449|             33222| 95487|
|       Akron|        OH|                             1845| 9033|                    66551|              3684|129192|
|     Alafaya|        FL|                             null|10336|                     6577|             34897| 63666|
|     Alameda|        CA|                             1329|27984|                     7364|              8265| 44232|
|      Albany|        GA|                              445|  650|                    53440|              1783| 17160|
|      Albany|        NY|                             16

In [23]:
# Compare row counts before and after dropping duplicate rows; two equal
# numbers mean us_race_cnt holds no fully duplicated rows.
tuple(
    frame.count()
    for frame in (us_race_cnt, us_race_cnt.dropDuplicates())
)

(596, 596)