## Portfolio: Dataset Name - Exploratory Data Analysis ##


**Problem Statement:** 

**Stakeholders:** 

## Part A. Import Cleaned Dataset ##

In [None]:
# Import all libraries #

# Authorship #
__author__ = "Taesun Yoo"
__email__ = "yoots1988@gmail.com"

In [None]:
#################################
# Part 2 - Discover the Problem #
#################################
# Write a group of functions:
def load_file(file):
    '''Read the CSV given by ``file`` and return it as a pandas DataFrame.

    ``file`` is handed to ``pd.read_csv`` unchanged, so anything that
    function accepts (path, URL, file-like object) works here.
    '''
    frame = pd.read_csv(file)
    return frame

def clean_data(raw_df, id_col='row_id'):
    '''Drop rows whose identifier duplicates an earlier row.

    Only de-duplication is performed; no other validity checks are applied.
    The first occurrence of each identifier is kept and ``raw_df`` itself is
    left unmodified.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame to de-duplicate.
    id_col : str or list of str, default 'row_id'
        Column(s) that identify a record.

    Returns
    -------
    pandas.DataFrame
        New frame with duplicate identifiers removed.
    '''
    return raw_df.drop_duplicates(subset=id_col)

def EDA_missing_data(cleaned_df):
    '''Tabulate missing values per column, most-missing first.

    Returns a DataFrame indexed by column name with two columns:
    'count' (number of null entries) and 'pct' (count divided by the number
    of rows, i.e. a fraction rather than a 0-100 percentage).
    '''
    n_rows = len(cleaned_df)
    summary = cleaned_df.isnull().sum().to_frame(name='count')
    summary['pct'] = summary['count'] / n_rows
    return summary.sort_values(by='pct', ascending=False)

def EDA_numerical_data(cleaned_df):
    '''Compute summary statistics for the numerical columns only.

    Previously ``describe(include='all')`` was used, which also produced one
    row per categorical column filled with NaN statistics and forced the
    whole result to object dtype. Restricting to numeric dtypes keeps the
    summary limited to what the function name promises.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        Frame containing at least one numeric column.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, with the columns
        'count', 'std', 'min', 'mean', 'max', '25%', '50%', '75%'.
    '''
    stat_cols = ['count', 'std', 'min', 'mean', 'max', '25%', '50%', '75%']
    numeric_summary = cleaned_df.describe(include=['number']).T
    return numeric_summary[stat_cols]

def EDA_categorical_data(cleaned_df):
    '''Compute summary statistics on the object-dtype (categorical) columns.

    The result comes straight from ``DataFrame.describe(include=['O'])``:
    one column per object-dtype column of ``cleaned_df``.
    '''
    object_summary = cleaned_df.describe(include=['O'])
    return pd.DataFrame(object_summary)
    
def EDA_pivot_table(cleaned_df, cat_var, num_var):
    '''Average a numerical variable within each level of a categorical one.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        Source data; it is not modified.
    cat_var : str
        Name of the grouping (categorical) column.
    num_var : str
        Name of the numerical column to average.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``cat_var`` and ``'avg_<num_var>_<cat_var>'``, one row
        per category.
    '''
    avg_col = "avg" + "_" + num_var + "_" + cat_var
    # The string alias avoids the FutureWarning recent pandas emits when a
    # NumPy callable such as np.mean is passed as aggfunc.
    pivot_cat_df = cleaned_df.pivot_table(index=cat_var, values=num_var, aggfunc='mean')
    pivot_cat_df = pivot_cat_df.reset_index(level=0)
    return pivot_cat_df.rename(columns={num_var: avg_col})
    
def EDA_plot_hist_label(df, cat_var, bins, lab_list):
    '''Overlay one histogram of 'poverty_rate' per label in ``lab_list``.

    For each label, the rows where ``df[cat_var]`` equals that label are
    selected and their 'poverty_rate' values are drawn on the current axes.
    No legend is drawn and ``plt.show()`` is not called here.
    '''
    for label in lab_list:
        in_group = df[cat_var] == label
        rates = df.loc[in_group, 'poverty_rate']
        plt.hist(rates, bins=bins, label=label)
        # Axis text is re-applied on every pass, so an empty lab_list
        # leaves the current axes completely untouched.
        plt.title('Histogram of Poverty Rate')
        plt.xlabel('Poverty Rate')
        plt.ylabel('# of US counties')

def EDA_plot_hist_2by2(df,
                       var1, bin1, lab1,
                       var2, bin2, lab2,
                       var3, bin3, lab3,
                       var4, bin4, lab4,
                       factor=None):
    '''Print skewness and plot a 2x2 grid of histograms.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the four columns to plot.
    var1..var4 : str
        Column names, drawn in subplot positions 1 to 4.
    bin1..bin4 : int or sequence
        ``bins`` argument for the matching histogram.
    lab1..lab4 : str
        Label used in the printed skewness line, the title and the x-axis.
    factor : number, optional
        Multiplier applied to each column before plotting (e.g. 100 to show
        fractions as percentages). ``None`` means no scaling.

    Notes
    -----
    Skewness is computed on the unscaled column. ``plt.show()`` is left to
    the caller.
    '''
    # The default factor=None used to reach ``df[var] * None`` and raise a
    # TypeError; treat it as "no scaling" instead.
    scale = 1 if factor is None else factor
    panels = [
        (var1, bin1, lab1, 'green'),
        (var2, bin2, lab2, 'blue'),
        (var3, bin3, lab3, 'cyan'),
        (var4, bin4, lab4, 'purple'),
    ]
    plt.figure(figsize=(8,8))
    plt.subplots_adjust(hspace=1/2, wspace=1/2)
    for position, (var, bins, lab, color) in enumerate(panels, start=1):
        print("Skewness is:" + lab, df[var].skew())
        plt.subplot(2, 2, position)
        plt.hist(df[var] * scale, color=color, bins=bins)
        plt.title('Histogram of ' + lab)
        plt.xlabel(lab)
        plt.ylabel('# of US counties')
    
def EDA_plot_freq_chart(df, cat_var, var_name):
    '''Plot a bar chart of how often each level of ``cat_var`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cat_var : str
        Column whose value counts are plotted.
    var_name : str
        Human-readable name used in the title and x-axis label.
    '''
    cat_var_count = df[cat_var].value_counts()
    # x and y must be passed by keyword: seaborn >= 0.12 accepts only ``data``
    # positionally, so the old positional call raises a TypeError there.
    sns.barplot(x=cat_var_count.index, y=cat_var_count.values, alpha=0.9)
    plt.title('Frequency Counts of '+ var_name)
    plt.ylabel('Counts')
    plt.xlabel(var_name, fontsize=10)
    plt.xticks(rotation=270)
    plt.show()

def EDA_plot_bar(cleaned_df, cat_var, num_var, color):
    '''Draw ``cleaned_df`` as a bar chart and show it.

    ``cat_var`` and ``num_var`` are only used as axis text: the x-axis is
    labelled with ``cat_var`` and the y-axis with 'Avg. ' + ``num_var``.
    The bars themselves come from whatever ``cleaned_df`` contains.
    '''
    y_label = 'Avg. ' + num_var
    cleaned_df.plot(kind='bar', color=color)
    plt.xlabel(cat_var)
    plt.ylabel(y_label)
    plt.xticks(rotation=0)
    plt.show()

def EDA_plot_box_whisker(df, num_var, cat_var, hue=None):
    '''Plot a box-whisker chart of ``num_var`` grouped by ``cat_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is no longer re-ordered in place.
    num_var : str
        Numerical column drawn on the y-axis.
    cat_var : str
        Categorical column drawn on the x-axis.
    hue : str, optional
        Column used to split each box by colour.
    '''
    # Sort a copy so the caller's frame keeps its row order; the sorted order
    # only matters for the order in which categories appear on the plot.
    ordered = df.sort_values(by=[num_var, cat_var], ascending=False)
    plt.figure()
    sns.set(style='whitegrid')
    # Keyword arguments are required by seaborn >= 0.12, where only ``data``
    # may be positional.
    sns.boxplot(x=cat_var, y=num_var, hue=hue, data=ordered)
    plt.title('Box Plot of '+ num_var + ' by '+ cat_var)
    plt.xticks(rotation=270, fontsize=9)

def EDA_convert_object_to_cat(df):
    '''Cast every object-dtype column of ``df`` to category dtype.

    The frame is modified in place and nothing is returned.
    '''
    for name, dtype in df.dtypes.items():
        if dtype.name != "object":
            continue
        df[name] = df[name].astype('category')

def EDA_encode_cat_var(df, col, target='poverty_rate'):
    '''Replace each category in ``col`` with the mean of ``target`` for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; ``df[col]`` is overwritten in place.
    col : str
        Column to encode. It must already be category dtype, because the
        ``.cat`` accessor is used to list its categories.
    target : str, default 'poverty_rate'
        Numerical column whose per-category mean becomes the encoded value.

    Returns
    -------
    None
    '''
    categories = df[col].cat.categories.tolist()
    cat_means = {
        cat: df[df[col] == cat][target].mean()
        for cat in categories
    }
    df[col] = df[col].map(cat_means)

def EDA_plot_corr_matrix(df, features, label):
    '''Draw an annotated heatmap of the correlations among features and label.

    ``features`` and ``label`` are concatenated with ``+``, so both are
    expected to be lists of column names. The upper triangle (diagonal
    included) is masked out so each pair is shown only once.
    '''
    selected = features + label
    corr = df[selected].corr()
    # Non-zero entries mark the cells seaborn should hide.
    hidden = np.triu(np.ones(corr.shape))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    plt.figure(figsize=(12,10))
    sns.heatmap(corr, cmap=palette, annot=True, fmt=".2f", mask=hidden)
    plt.xticks(rotation=90)
    plt.show()

def EDA_plot_crosstab(df, cat_var1, cat_var2):
    '''Cross-tabulate two categorical variables.

    Returns a frequency table with the levels of ``cat_var1`` as rows and
    the levels of ``cat_var2`` as columns. Despite the name, nothing is
    plotted here.
    '''
    rows = df[cat_var1]
    cols = df[cat_var2]
    return pd.crosstab(rows, cols)

def EDA_plot_scatter(df,
                     var1, lab1, c1,
                     var2, lab2, c2,
                     factor=None):
    '''Plot two stacked scatter plots of a variable against 'poverty_rate'.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding ``var1``, ``var2`` and a 'poverty_rate' column.
    var1, var2 : str
        Columns drawn on the x-axis of the top and bottom plot.
    lab1, lab2 : str
        Labels used in the titles and x-axis labels.
    c1, c2 : color
        Marker colours for the top and bottom plot.
    factor : number, optional
        Multiplier applied to the x values before plotting. ``None`` means
        no scaling.
    '''
    # The default factor=None used to reach ``df[var] * None`` and raise a
    # TypeError; treat it as "no scaling" instead.
    scale = 1 if factor is None else factor
    plt.figure(figsize=(8,8))
    plt.subplots_adjust(hspace=0.4, wspace=0.9)
    panels = [(var1, lab1, c1), (var2, lab2, c2)]
    for position, (var, lab, color) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.scatter(df[var] * scale, df['poverty_rate'], color=color)
        plt.title('Relationship between ' + lab + ' and Poverty Rate')
        plt.xlabel(lab)
        plt.ylabel('Poverty Rate')
    
def convert_data_type_category(df, var_name):
    '''Cast column ``var_name`` of ``df`` to string and return that column.

    The conversion is written back into ``df`` in place. The function used
    to ignore ``df`` and operate on a global ``df_eda``; it now works on the
    frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert.
    var_name : str
        Name of the column to cast with ``astype('str')``.

    Returns
    -------
    pandas.Series
        The converted column.
    '''
    df[var_name] = df[var_name].astype('str')
    return df[var_name]
    
def split_dataframe_by_string(df, cat_var, str_val):
    '''Keep the rows whose ``cat_var`` value contains ``str_val``.

    Matching is case-sensitive and literal (``str_val`` is not treated as a
    regular expression).
    '''
    hits = df[cat_var].str.contains(str_val, case=True, regex=False)
    return df.loc[hits]

def EDA_plot_multi_facet_scatter(df1, df2,
                                 var1, lab1,
                                 var2, lab2,
                                 response, factor):
    '''Plot two side-by-side scatter plots comparing two data frames.

    The left panel plots ``var1`` and the right panel ``var2`` (both scaled
    by ``factor``) against ``response``, sharing the y-axis. In each panel
    ``df1`` is drawn first with the legend label 'Nonmetro' and ``df2``
    second with the label 'Metro'.
    '''
    f, (ax1, ax2)=plt.subplots(1, 2, sharey=True, figsize=(8,4))
    plt.subplots_adjust(hspace=0.2, wspace=0.2)
    plt.tight_layout(pad=0.4, w_pad=1.5, h_pad=1.0)
    groups = ((df1, 'Nonmetro'), (df2, 'Metro'))
    for axis, x_var, x_lab in ((ax1, var1, lab1), (ax2, var2, lab2)):
        for frame, group_name in groups:
            axis.scatter(frame[x_var]*factor, frame[response],
                         label=group_name, edgecolor='w')
        axis.legend(loc='upper right')
        axis.set_xlabel(x_lab, fontsize=10)
        axis.set_ylabel(response, fontsize=10)
        axis.grid(False)

def EDA_plot_color_sc_scatter(df, var1, lab1,
                              var2, lab2,
                              var3, lab3, response):
    '''Plot two scatter plots whose points are coloured by ``response``.

    The left panel plots ``var1`` and the right panel ``var2`` on the x-axis;
    both use ``var3`` on the shared y-axis. A single colour bar, built from
    the left panel's scatter, is placed to the right of the figure and
    labelled with ``response``.
    '''
    f, (ax1, ax2) = plt.subplots(1,2, sharey=True, figsize=(10,6))
    drawn = []
    for axis, x_var, x_lab in ((ax1, var1, lab1), (ax2, var2, lab2)):
        points = axis.scatter(df[x_var], df[var3], c=df[response],
                              cmap=plt.cm.coolwarm, edgecolor='w')
        drawn.append(points)
        axis.set_xlabel(x_lab, fontsize=14)
        axis.set_ylabel(lab3, fontsize=14)
        axis.grid(False)
    # legend: make room on the right, then add a colour bar scaled by the
    # response values of the first (left) scatter.
    plt.subplots_adjust(bottom=0.1, right=0.8, top=0.9)
    cax = plt.axes([0.85, 0.1, 0.05, 0.8])
    cb = f.colorbar(drawn[0], cax=cax)
    cb.set_label(response)
    
#def join_data(df1, df2, key=None, left_index=False, right_index=False):
#    '''performs inner join to return records exist in both dataframes'''
#    return pd.merge(df1, df2, how='inner', on=key, left_index=left_index, right_index=right_index)

#def drop_row_by_index(df, idx_list):
#    df_row_drop = df.drop(df.index[idx_list])
#    return df_row_drop

#def drop_column_by_index(df, col_list):
#    df_column_drop = df.drop([col_list], axis=1)
#    return df_column_drop

In [None]:
# --- 3. Load the data --- #
# Define input CSVs:


# Define type of variables list:

# Define variables to drop

# Load data

# Metadata of dataframe: EDA

## Part B. Exploratory Data Analysis ##

#### Dataset Name: Training Set ###
Data exploration is conducted on a cleaned training set. The main goal of this phase is to explore any interesting relationships among features and identify which features are good predictors of the poverty rate.

The following set of questions is asked:
1. Can I count something interesting?
2. Can I find some trends (increase or decrease and any anomalies)?
3. Can I plot a bar chart or a histogram?
4. Can I make a scatter plot?

This set of guiding questions will help us explore insights and tell a compelling story about the US poverty dataset.

In [None]:
# compute top 10 rows on a eda_dataframe:
    
# check duplicates:

#---- Compute % of Missing Data ----#

In [None]:
#---- Compute Summary Statistics: numerical data ----#

In [None]:
#---- Compute Summary Statistics: categorical data ----#

In [None]:
#---- Visualize response variable ----#


### Summary: Poverty Rate ###
Visualize the response variable - poverty rate. First, the box plot shows that the poverty rate is centered somewhere around 15, with many outliers above the upper bound (UB) of approx. 32. Second, the histogram shows that the distribution of poverty rate is quite close to a normal distribution, although it appears slightly right-skewed (positive direction).

In [None]:
#--- Use IQR to detect potential outliers ----#

# Check LB Outliers:

# Check UB Outliers:

# check potential outliers by categorical vars:

#--- Check the suspicious outliers by an economic typology: mining-dependent

In [None]:
#---- Plot histograms ----#
# Create a list of economic typology:

# Plot multiple histograms on poverty rate by economic type:


### Histogram of Poverty Rate by Different Types of Economic Typology ###
The histogram shows that there is quite a lot of overlap between different economic typologies in the lower-bound area of poverty rate. In comparison, less overlap between the economic typologies is apparent outside that lower-bound area.

In [None]:
# Plot 2by2 histogram as a subplot: demographic

### Summary: Economic Indicators ###
1. Labor: the distribution is not quite normal. The majority of US counties have a civilian labor rate above 50%.
2. Unemployment rate: the distribution is skewed to the right. The majority of US counties have an unemployment rate around 5%.
3. Uninsured children: the distribution is skewed to the right. The majority of US counties have fewer than 10% uninsured children.
4. Uninsured adults: the distribution is closer to normal. The majority of US counties have more than 20% uninsured adults.

In [None]:
# Plot 2by2 histogram as a subplot: health indicators

### Summary: Health Indicators ###
1. Obesity: the distribution is close to normal. The majority of US counties have an adult obesity rate greater than 30%.
2. Adult smoking: the distribution is skewed to the right. More than half of US counties have an adult smoking rate below 25%.
3. Diabetes: the distribution is close to normal. The majority of US counties have a diabetes rate below 10%.
4. Excessive drinking: the distribution is skewed to the right. The majority of US counties have an excessive drinking rate below 15%.

In [None]:
# Plot 2by2 histogram as a subplot: education

### Summary: Education Indicators ###
1. No high school diploma: the distribution is skewed to the right. In the majority of US counties, fewer than 15% of adults have no high school diploma.
2. With high school diploma: the distribution is close to normal. In more than half of US counties, more than 30% of adults have a high school diploma.
3. With some college: the distribution is close to normal. In the majority of US counties, more than 30% of adults have some college education.
4. With university degree: the distribution is skewed to the right. In the majority of US counties, fewer than 20% of adults have a university degree.

In [None]:
#---- Plot bar chart(s) ----#
# Plot bar chart: economic typology

### Summary: Frequency Chart - Economic Typology ###
1. Non-specialized economic counties had the highest count among US counties.
2. Mining-dependent counties had the lowest count among US counties.

In [None]:
#---- Plot box-whisker plot chart(s) ----#

### Summary: Box-Whisker Plots by Categorical Variables ### 
1. Economic typology: federal/state-dependent counties showed the highest poverty rate (19.5%). In contrast, farming-dependent counties had the lowest poverty rate (14%).
2. Urbanization degree: as population size increased from small micropolitan to large metropolitan counties, poverty rates decreased.
3. Urban-rural classification: moving from non-metro to metro counties, poverty rates went down (17.5% vs. 12.5%).

In [None]:
#---- Convert categorical variable data type from object to category ----#


#---- Encode categorical variables using avg. poverty rate for each category to replace label ----#
    
#---- Plot correlation matrix chart ----#
# Define list of features and label (poverty rate)


### Summary: Correlation Matrix ###
We can conclude from above that % civilian labor is the most strongly correlated with poverty rate, followed by % unemployment, % low birth weight, % diabetes and % uninsured adults.

Among the features, the following trends were recognized:
1. Urban influential size and urban area concentration have a strong degree of positive correlation (0.88).
2. % below 18 year old and Birth rate per 1k have a strong degree of positive correlation (0.73).
3. % Uninsured adults and Uninsured children have a strong degree of positive correlation (0.72).

In [None]:
#---- Plot a cross-tabulate based on two categorical variables ----#    


### Summary: Cross Tabulation ###
1. With respect to non-specialized: the largest number of counties were non-specialized and non-metro with a population size of 2,500 to 19,999, followed by non-specialized metro counties with a population over 1 million.
2. With respect to farm-dependent: most of these counties were dependent on farming industries in non-metro areas with a population of less than 2,500 (rural area).
3. In mid-size non-metro areas with a population of 2,500 to 19,999, most counties were dependent on manufacturing and mining.
4. Small-size non-metro counties with a population of 2,500 to 19,999 were mostly dependent on federal/state government funding.

In [None]:
#---- Plot a scatter plot: numerical and categorical variables ----#
# Demographics


### Summary: Scatter Plot - economic indicators vs. poverty rate
1. % civilian labor: a strong trend was observed in which the poverty rate decreased as labor power increased.
2. % uninsured adults: a trend was observed in which the poverty rate decreased as the % of uninsured adults increased.

In [None]:
# Health indicators


### Summary: Scatter Plot - health indicators vs. poverty rate
1. % excessive drinking: the higher the excessive drinking %, the lower the poverty rate (a slight decrease was observed).
2. % low birthweight: the higher the low birthweight %, the higher the poverty rate.

In [None]:
# Education indicators


### Summary: Scatter Plot - education indicators vs. poverty rate ###
1. % university education: as the % with a university degree increased, lower poverty rates were observed.
2. % no high school diploma: as the % with no high school diploma increased, higher poverty rates were observed.

In [None]:
#---- Plot multi-faceted scatter plots by categorical variable ----#


In [None]:
#---- Plot color scaled scatter plots by numerical variable ----#


### Summary: Multi-faceted Scatter Plot ###
% no high school diploma vs. % university degree: the top two plots showed that the higher the education level of the population, the higher the % of civilian labor. Thus, poverty rates were found to be lower.