In [497]:
#Independent Data Science Project Part 1 by Zara Clacken
#Data Source: "Children in single-parent families by race in the United States", Kids Count Data Center,  https://bit.ly/2O5A5tZ
#Research Question: What percentage of all children living in single-parent homes in the United States during the 2010s
#does each race make up?

In [498]:
#Import Packages
import warnings
warnings.filterwarnings('ignore')  
import numpy as np
import pandas as pd 

In [499]:
#Import Raw Data Set
#A raw string keeps the Windows backslashes literal: '\z' and '\D' are not valid escape
#sequences in a normal string literal and raise a warning on newer Python versions.
RAW_DATA_PATH = r'C:\Users\zara\Documents/Data Science Portfolio_Zara Clacken/Raw Data/Children in single-parent families by race.xlsx'
singleparentkidsraw = pd.read_excel(RAW_DATA_PATH)
singleparentkidsraw

Unnamed: 0,LocationType,Location,Race,TimeFrame,DataFormat,Data
0,Nation,United States,American Indian,2005,Percent,0.49
1,Nation,United States,Asian and Pacific Islander,2005,Percent,0.17
2,Nation,United States,Black or African American,2005,Percent,0.65
3,Nation,United States,Hispanic or Latino,2005,Percent,0.36
4,Nation,United States,Non-Hispanic White,2005,Percent,0.23
...,...,...,...,...,...,...
11077,Territory,Puerto Rico,Non-Hispanic White,2019,Number,S
11078,Territory,Puerto Rico,Hispanic or Latino,2019,Number,341000
11079,Territory,Puerto Rico,Black or African American,2019,Number,44000
11080,Territory,Puerto Rico,Asian and Pacific Islander,2019,Number,S


In [500]:
#Column Names? Show the column labels of the raw data frame
singleparentkidsraw.columns




Index(['LocationType', 'Location', 'Race', 'TimeFrame', 'DataFormat', 'Data'], dtype='object')

In [501]:
#Number of Rows? Count the rows of the raw data frame
len(singleparentkidsraw)

11082

---

In [502]:
#Change Column Names
#Rename by label rather than overwriting the whole header by position, so an added or
#re-ordered column in the source file cannot be silently mislabelled.
singleparentkidsraw = singleparentkidsraw.rename(columns={'TimeFrame': 'Year', 'Data': 'TotalChildren'})
singleparentkidsraw.columns

Index(['LocationType', 'Location', 'Race', 'Year', 'DataFormat',
       'TotalChildren'],
      dtype='object')

---

In [503]:
#What are the Unique Values of each Column?
#Distinct values of the LocationType column
singleparentkidsraw.LocationType.unique()

array(['Nation', 'State', 'City', 'Territory'], dtype=object)

In [504]:
#Distinct values of the DataFormat column
singleparentkidsraw.DataFormat.unique()

array(['Percent', 'Number'], dtype=object)

In [505]:
#Distinct values of the Race column
singleparentkidsraw.Race.unique()

array(['American Indian', 'Asian and Pacific Islander',
       'Black or African American', 'Hispanic or Latino',
       'Non-Hispanic White', 'Two or more races', 'Total'], dtype=object)

In [506]:
#Distinct values of the Year column
singleparentkidsraw.Year.unique()

array([2005, 2006, 2007, 2008, 2013, 2018, 2009, 2010, 2015, 2011, 2012,
       2014, 2016, 2017, 2019], dtype=int64)

---

In [None]:
#Keep only the national head-count rows for 2010-2019 (excluding the 'Total' race rows),
#then drop the LocationType, Location and DataFormat columns.

YearFilter1 = singleparentkidsraw.Year > 2009
YearFilter2 = singleparentkidsraw.Year < 2020
FormatFilter = singleparentkidsraw.DataFormat == 'Number'
LocationFilter = singleparentkidsraw.LocationType == 'Nation'
TotalFilter = singleparentkidsraw.Race != 'Total'

#Combine the masks with & and select once. Chaining df[mask1][mask2]... applies each
#full-length mask to an already-shrunken frame, which pandas has to reindex (and warns about).
RowFilter = YearFilter1 & YearFilter2 & FormatFilter & LocationFilter & TotalFilter

#.copy() makes the result an independent frame so later column assignments are unambiguous
singleparentkids = (
    singleparentkidsraw.loc[RowFilter]
    .drop(columns=['LocationType', 'Location', 'DataFormat'])
    .copy()
)

singleparentkids.tail(11)

In [None]:
#Race Values? Distinct races left in the filtered frame
singleparentkids.Race.unique()

In [None]:
#Year Values the Same? Distinct years left in the filtered frame
singleparentkids.Year.unique()

In [None]:
#How many rows in data frame now? Row count of the filtered frame
len(singleparentkids)

---

In [None]:
#Check Variable Data Types (column dtypes and non-null counts of the filtered frame)
singleparentkids.info()

In [None]:
#Change Variable Data Types
#Year and Race become categoricals and the child counts 64-bit integers, in a single cast
singleparentkids = singleparentkids.astype({
    'Year': 'category',
    'Race': 'category',
    'TotalChildren': 'int64',
})

In [None]:
#Check Data Type Changes (re-run info() to see the dtypes after the cast)
singleparentkids.info()

---

In [None]:
#Preview the first six rows of the filtered frame
singleparentkids.head(6)

In [None]:
#Distinct Race values of the filtered frame
singleparentkids.Race.unique()

In [None]:
#Distinct Year values of the filtered frame
singleparentkids.Year.unique()

---

In [None]:
#Percentage of All Single-Parent Children Made Up by Each Race

In [None]:
#Total Number of Children of All Races Per Year

yearlist = [2013, 2018, 2010, 2011, 2015, 2012, 2014, 2016, 2017, 2019]

#For every listed year, add up the counts of all races and print "year : total ," on one line
for Y in yearlist:
    rows_in_year = singleparentkids.Year == Y
    S = singleparentkids.loc[rows_in_year, 'TotalChildren'].sum()
    print(Y, ':', S, ',', end=' ')

In [None]:
#Create a Dictionary For the Sums
#Build the {year: total children} lookup from the frame itself instead of re-typing the
#printed totals, so it cannot drift from the data or carry a transcription error.
#observed=True limits the groups to years that actually occur when Year is categorical.
Sums = singleparentkids.groupby('Year', observed=True)['TotalChildren'].sum().to_dict()
Sums[2015]

In [None]:
#Create New Column for Percentage of a Year's Total Population
#Use NaN as the "not yet computed" placeholder: a " " string would make the column
#object dtype and look like real data until the percentages are written.
singleparentkids['PercentofTotalPopulation'] = np.nan
singleparentkids

In [None]:
#Row count of the frame
len(singleparentkids)

In [None]:
#Calculate each row's share (%) of its year's total, using the Sums dictionary
#P is a Series that keeps the frame's own index, so assigning it to a column lines every
#percentage up with its row regardless of row order. (Stacking per-year slices and reshaping
#to a fixed (60,1) array only works if the rows happen to be grouped by year in exactly
#the order of yearlist, and breaks if the row count changes.)
year_totals = singleparentkids.Year.astype('int64').map(Sums)
P = (100 * singleparentkids.TotalChildren / year_totals).round(2)
P

In [None]:
#Insert Percent Column into Data Frame
#NOTE(review): if P is a plain array (no index) its values are written by position,
#so P's row order has to match the frame's row order — confirm.
singleparentkids['PercentofTotalPopulation'] = P
singleparentkids

---

In [None]:
#Import Seaborn 
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Unique Races, as a plain Python list
singleparentkids.Race.unique().tolist()


In [None]:
#Replace American Indian with Native American
#Race was cast to a categorical earlier, so rename the category itself; Series.replace on
#categorical data is deprecated in recent pandas. Labels missing from the categories are ignored.
singleparentkids['Race'] = singleparentkids.Race.cat.rename_categories({'American Indian': 'Native American'})
singleparentkids

In [None]:
#Replace Non-Hispanic White with White
#Race was cast to a categorical earlier, so rename the category itself; Series.replace on
#categorical data is deprecated in recent pandas. Labels missing from the categories are ignored.
singleparentkids['Race'] = singleparentkids.Race.cat.rename_categories({'Non-Hispanic White': 'White'})
singleparentkids

In [None]:
#Replace Two or more races with Multiracial
#Race was cast to a categorical earlier, so rename the category itself; Series.replace on
#categorical data is deprecated in recent pandas. Labels missing from the categories are ignored.
singleparentkids['Race'] = singleparentkids.Race.cat.rename_categories({'Two or more races': 'Multiracial'})
singleparentkids

In [None]:
#Replace Hispanic or Latino with Hispanic
#Race was cast to a categorical earlier, so rename the category itself; Series.replace on
#categorical data is deprecated in recent pandas. Labels missing from the categories are ignored.
singleparentkids['Race'] = singleparentkids.Race.cat.rename_categories({'Hispanic or Latino': 'Hispanic'})
singleparentkids

In [None]:
#Replace Black or African American with Black
#Race was cast to a categorical earlier, so rename the category itself; Series.replace on
#categorical data is deprecated in recent pandas. Labels missing from the categories are ignored.
singleparentkids['Race'] = singleparentkids.Race.cat.rename_categories({'Black or African American': 'Black'})
singleparentkids

In [None]:
#Create Plot 
 

#Create Lists and Dictionaries
#Line colour for each race label
LineColor = {
    'Native American': 'Red',
    'Asian and Pacific Islander': 'Gold',
    'Black': 'Black',
    'Hispanic': 'Orange',
    'White': 'Pink',
    'Multiracial': 'Brown',
}
#Marker shape for each race label
LineMarker = {
    'Native American': 's',
    'Asian and Pacific Islander': 'o',
    'Black': '^',
    'Hispanic': 's',
    'White': 'o',
    'Multiracial': '^',
}
#Race labels in alphabetical order, taken from the colour table's keys
racelist = sorted(LineColor)




#Define plot's function
def kidsplot(racelist):
    """Draw one line per race: its yearly percentage of children in single-parent homes.

    Reads the notebook-level ``singleparentkids`` frame (columns ``Race``, ``Year`` and
    ``PercentofTotalPopulation``) and the ``LineColor`` / ``LineMarker`` lookups.

    Parameters
    ----------
    racelist : iterable of str
        Race labels to plot; each must be a key of ``LineColor`` and ``LineMarker``.
    """
    for race in racelist:
        sns.lineplot(data=singleparentkids[singleparentkids.Race == race],
                     x='Year', y='PercentofTotalPopulation', label=race,
                     c=LineColor[race], marker=LineMarker[race], markersize=10)
    #Place the legend outside the axes, anchored at the top-right corner
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 12})
    plt.title("U.S. Children in Single-Parent Homes by Race", fontsize=15)
    plt.ylabel('Percent of Total Single-Parent Home Population(%)', fontsize=12)
    plt.xlabel('Year', fontsize=12)
    plt.ylim(0, 50)

    #plt.show must be called; the bare name only references the function and does nothing
    plt.show()


    
#Plot Size
#Default figure size (width, height) for figures created from here on
plt.rcParams['figure.figsize'] = (10,5)
    
#Draw the plot for every race in racelist
kidsplot(racelist)




---

In [None]:
#Show the current column labels
singleparentkids.columns

In [None]:
#Change Column Names Again For Export to be joined with other dataframes in SQL
#Rename by label rather than overwriting the whole header by position, so the new names
#always land on the intended columns (Race and Year keep their names).
singleparentkids = singleparentkids.rename(columns={
    'TotalChildren': 'Number of Children(Single Parent Homes)',
    'PercentofTotalPopulation': 'Percent of Total Children(Single Parent Homes)',
})
singleparentkids

In [None]:
#Export Single Parent Kids Data Frame as a CSV
#A raw string keeps the Windows backslashes literal: '\z' and '\D' are not valid escape
#sequences in a normal string literal and raise a warning on newer Python versions.
#The frame's index is written as the first (unnamed) CSV column.
EXPORT_PATH = r'C:\Users\zara\Documents/Data Science Portfolio_Zara Clacken/Created CSV Files/2010-2019_Single_Parent_Children_By_Race.csv'
singleparentkids.to_csv(EXPORT_PATH)