In [17]:
#!pip install packagename
# importing modules
import geopandas as gpd
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from os import chdir as cd
import time
import fiona
from datetime import date
import kaleido

In [18]:
# Load the attribute table from CSV.
# GEOID is forced to str (presumably to keep leading zeros -- verify);
# the first CSV column is used as the row index.
attributes_csv = r'D:\Work\Box Sync\Trends_all states\Output from Analysis\df_attributes.csv'
df_main = pd.read_csv(attributes_csv, index_col=0, dtype={'GEOID': str})
# df.columns

In [19]:
# Checking data types
# df_main.shape, df_main.dtypes, 

In [20]:
# Number of rows per REGION in the table as loaded (before any filtering)
df_main['REGION'].value_counts()

REGION
South                    10504
Midwest                  10213
West                      6063
Northeast                 4816
Outside Contiguous US      292
Name: count, dtype: int64

# Convert continuous to categorical variable

In [21]:
# Work on a copy so the table held in df_main is left unmodified
df = df_main.copy()

# ======== To fill null values for the binned column ========
 
def replace_nan_by_noData(dataframe, column_name):
    """Return one column as a categorical Series with NaN shown as "no data".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column; it is not modified.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.Series
        Category-dtype Series with the same index, in which every missing
        value is replaced by the category "no data".
    """
    # Converting to category dtype lets "no data" be registered as a valid level
    as_category = pd.Series(dataframe[column_name], dtype='category')
    # add_categories raises ValueError for an existing category, so only add
    # it when missing; this keeps the helper safe on already-filled columns
    if "no data" not in as_category.cat.categories:
        as_category = as_category.cat.add_categories("no data")
    return as_category.fillna("no data")


In [22]:
# Bin the continuous median income into categories so it can be used in the
# parallel-categories plot and is easier to read.
# Bin edges follow the 2023 IRS marginal tax brackets.
# Source: https://www.irs.gov/newsroom/irs-provides-tax-inflation-adjustments-for-tax-year-2023
# NOTE: the 24% bracket ends at $182,100 in the 2023 tables; the edge was
# previously typed as 182000, which put incomes in (182000, 182100] in "32%".
# NOTE(review): the other edges look like the head-of-household thresholds -- confirm.
income_bin_edges = [0, 15700, 59850, 95350, 182100, 231250, 1000000]
income_bin_labels = ["10%", "12%", "22%", "24%", "32%", "35%+"]

# pd.cut intervals are right-closed: an income of exactly 0, an income above
# 1,000,000, or a missing income falls in no bin and becomes NaN.
df['hh_income_by_taxRate'] = pd.cut(df['median_income'],
                                    income_bin_edges,
                                    labels=income_bin_labels)
print('Null values:---')
print(df['hh_income_by_taxRate'].isnull().sum())

# Second column for plotting: same bins, with NaN shown as the "no data" category
df['hhincome by taxRate'] = replace_nan_by_noData(df, 'hh_income_by_taxRate')

Null values:---
3312


In [23]:
# Bin the two vehicle-ownership columns into categories so they can be used in
# the parallel-categories plot. Both use the same six labels; values that fall
# outside the edges (or are missing) end up in the "no data" category.
vehicle_bin_labels = ["0-10", "10-20", "20-30", "30-40", "40-50", "50+"]

# output column -> (source column, bin edges)
# NOTE(review): the upper edge differs (100 vs 105) between the two columns --
# confirm this is intentional.
vehicle_bins = {
    'One or fewer vehicles': ('veh_<=_1', [-1, 10, 20, 30, 40, 50, 100]),
    'Two or more vehicles': ('veh_2_or+', [-1, 10, 20, 30, 40, 50, 105]),
}

for binned_col, (source_col, bin_edges) in vehicle_bins.items():
    df[binned_col] = pd.cut(df[source_col], bin_edges, labels=vehicle_bin_labels)
    df[binned_col] = replace_nan_by_noData(df, binned_col)

In [24]:
# Number of rows per REGION in the working frame df
df['REGION'].value_counts()

REGION
South                    10504
Midwest                  10213
West                      6063
Northeast                 4816
Outside Contiguous US      292
Name: count, dtype: int64

In [25]:
# Show which STATEFP codes make up the 'Outside Contiguous US' region
outside_region = 'Outside Contiguous US'
print('Cities in Region 5: Outside Contiguous US ')
print(df.loc[df['REGION'] == outside_region, 'STATEFP'].value_counts())

# Excluding cities in Region 5 since that is not available in forecasted values.
# .copy() makes the filtered frame independent of the original, so the column
# assignments done on df in later cells do not raise SettingWithCopyWarning.
df = df[df['REGION'] != outside_region].copy()
print(f'Remaining cities after excluding Region 5: {df.shape}')


# df = df.sort_values(by = ['REGION', ])

Cities in Region 5: Outside Contiguous US 
STATEFP
72    292
Name: count, dtype: int64
Remaining cities after excluding Region 5: (31596, 40)


# hh vehicle ownership vs Population trend by REGION

In [54]:
# Parallel-categories figure:
# REGION -> one/fewer vehicles -> two/more vehicles -> future trend -> city type
trend_col = 'future trend from SSP 4'

# Numeric code per region; used only to colour the ribbons
df['REGION_NO'] = df['REGION'].map({'Northeast': 1, 'Midwest': 2, 'West': 3, 'South': 4})

# ---- Dimensions (one per vertical axis of the plot) ----
# categoryarray fixes the top-to-bottom order of the categories on an axis
region_dim = go.parcats.Dimension(
    values=df["REGION"],
    label="REGION",
    categoryarray=['Northeast', 'Midwest', 'West', 'South'],
)

# The binned labels ("0-10", "10-20", ...) are ordered alphabetically
veh_1_dim = go.parcats.Dimension(
    values=df['One or fewer vehicles'],
    categoryorder="category ascending",
    label='% pop with One/fewer vehicles',
)

veh_2_dim = go.parcats.Dimension(
    values=df['Two or more vehicles'],
    categoryorder="category ascending",
    label='% pop with Two/more vehicles',
)

city_type_order = ['urban', 'suburban', 'periurban', 'rural', 'not enough data']
city_dim = go.parcats.Dimension(
    values=df["city type"],
    label="city type",
    categoryarray=city_type_order,
    ticktext=city_type_order,
)

future_trend_dim = go.parcats.Dimension(
    values=df[trend_col],
    label=trend_col,
    categoryarray=['increasing', 'no trend', 'decreasing'],
)

# ---- Ribbon colours: one colour per region code ----
color = df['REGION_NO'].tolist()
colorscale = ['#8C564B', '#9467BD', '#F7B6D2', '#DBDB8D']

# ---- Figure ----
fig = go.Figure(
    data=[
        go.Parcats(
            dimensions=[
                region_dim,
                veh_1_dim,
                veh_2_dim,
                future_trend_dim,
                city_dim,
            ],
            line={"color": color, "colorscale": colorscale, 'shape': 'hspline'},
            hoveron="color",
            hoverinfo="count + probability",
            labelfont={"size": 10, "family": "arial"},
            tickfont={"size": 10, "family": "arial"},
            arrangement="freeform",
        )
    ]
)

fig.update_layout(
    height=300,
    width=700,
    font=dict(size=10),
    margin=dict(l=10, r=18, t=15, b=12),
)

# Raw string, so a single backslash is the path separator; the previous "\\"
# put a doubled separator in the output path.
fig.write_image(r"D:\Work\Box Sync\NC Figures\veh_ownership_" + str(trend_col) + ".png",
                scale=4, engine="kaleido")

# Income vs Population trend by REGION

In [31]:
## Parallel-categories chart: REGION -> income bracket -> future trend -> city type

trend_col = 'future trend from SSP 4'

# Numeric code per region; used only to colour the ribbons
region_codes = {'Northeast': 1, 'Midwest': 2, 'West': 3, 'South': 4}
df['REGION_NO'] = df['REGION'].map(region_codes)

# ---- Dimensions, defined in the order they appear in the figure ----
# Region axis, fixed category order
region_dim = go.parcats.Dimension(
    label="REGION",
    values=df["REGION"],
    categoryarray=['Northeast', 'Midwest', 'West', 'South'],
)

# Income-bracket axis, labels ordered alphabetically
income_dim = go.parcats.Dimension(
    label='hhincome by taxRate',
    values=df['hhincome by taxRate'],
    categoryorder="category ascending",
)

# Future population trend axis, fixed category order
future_trend_dim = go.parcats.Dimension(
    label=trend_col,
    values=df[trend_col],
    categoryarray=['increasing', 'no trend', 'decreasing'],
)

# City-type axis, fixed category order with matching tick text
city_type_order = ['urban', 'suburban', 'periurban', 'rural', 'not enough data']
city_dim = go.parcats.Dimension(
    label="city type",
    values=df["city type"],
    categoryarray=city_type_order,
    ticktext=list(city_type_order),
)

# ---- Ribbon colours: one colour per region code ----
color = df['REGION_NO'].tolist()
colorscale = ['#8C564B', '#9467BD', '#F7B6D2', '#DBDB8D']

# ---- Figure ----
income_trace = go.Parcats(
    dimensions=[region_dim, income_dim, future_trend_dim, city_dim],
    line={"color": color, "colorscale": colorscale, 'shape': 'hspline'},
    hoveron="color",
    hoverinfo="count + probability",
    labelfont={"size": 10, "family": "arial"},
    tickfont={"size": 10, "family": "arial"},
    arrangement="freeform",
)
fig = go.Figure(data=[income_trace])

fig.update_layout(
    height=300,
    width=600,
    font={"size": 10},
    margin={"l": 10, "r": 18, "t": 15, "b": 12},
)

# Export as a static PNG through kaleido
fig.write_image(r"D:\Work\Box Sync\NC Figures\median_income_" + str(trend_col) + ".png", engine="kaleido")

# Extras

In [12]:
# # # Build parcats dimensions
# # categorical_dimensions = ['REGION', 'One or fewer vehicles', 'Two or more vehicles','city type', 'future trend from SSP 4', ];

# # dimensions = [dict(values=df[label], label=label) for label in categorical_dimensions]
# trend_col =  'future trend from SSP 2'
# df['REGION_NO'] = df['REGION'].map({'Northeast': 1, 'Midwest': 2,'West': 3, 'South': 4})
# # Build colorscale
# color = [x for x in df['REGION_NO']]
# colorscale = ['#8C564B', '#9467BD', '#F7B6D2', '#DBDB8D'] # ['goldenrod','forestgreen','slateblue', 'darkmagenta',]

# # Build figure as FigureWidget
# fig = go.Figure(go.Parcats(
#         # domain={'y': [0, 0.4]}, 
#         dimensions=dimensions,
#         line={'colorscale': colorscale,
#               'color': color, 
#               'shape': 'hspline'}))

# fig.update_layout(
#     height=600,
#     width=1000,
#         # dragmode='lasso', hovermode='closest',
#         font=dict(
#         # family="Courier New, monospace",
#         size=18,
#         # color="RebeccaPurple"
#     ))

# fig
# # fig.write_html(r'D:\Work\Box Sync\PhD_Work_Uttara\NC_submission_shared\Final_submission\Final_plots\veh_ownership_populationTrend.html')
# # fig.write_image(r'D:\Work\Box Sync\Depop Paper NC\Paper Draft NC\Figures\vehicle_ownership_SSP2.png')

In [13]:

# # Build parcats dimensions
# categorical_dimensions = ['REGION','hhincome by taxRate', 'future trend from SSP 4', 'city type',];

# dimensions = [dict(values=df[label], label=label) for label in categorical_dimensions]

# # Build colorscale
# color = [x for x in df['REGION_NO']]
# colorscale = ['#8C564B', '#9467BD', '#F7B6D2', '#DBDB8D'] # ['darkseagreen', 'orchid', 'plum', 'lightsteelblue'] # ['goldenrod','forestgreen','lightblue', 'slateblue', 'darkmagenta']

# # Build figure as FigureWidget
# fig = go.Figure(go.Parcats(
#         # domain={'y': [0, 0.4]}, 
#         dimensions=dimensions,
#         line={'colorscale': colorscale,
#               'color': color, 
#               'shape': 'hspline'}))

# fig.update_layout(
#     height=400,
#     width=1000,
#         # dragmode='lasso', hovermode='closest',
#         font=dict(
#         # family="Courier New, monospace",
#         size=10,
#         # color="RebeccaPurple"
#     ))

# fig
# fig.update_traces(dimensions=[{"categoryorder": "category ascending"} for _ in dimensions])

# # fig.write_html(r'D:\Work\Box Sync\PhD_Work_Uttara\NC_submission_shared\Final_submission\Final_plots\median_income_populationTrend.html')
# # fig.write_image(r'D:\Work\Box Sync\Depop Paper NC\Paper Draft NC\Figures\median_income_SSP2.png')


In [14]:


# # for the parallel plot, the first column needs to hold numeric values
# for i in range(len(df)):
#     # print(i)
#     if df.loc[i,'REGION'] == 'Northeast':
#         df.loc[i,'REGION_NO'] = 1
#     elif df.loc[i,'REGION'] == 'Midwest':
#         df.loc[i,'REGION_NO'] = 2 
#     elif df.loc[i,'REGION'] == 'South':
#         df.loc[i,'REGION_NO'] = 3
#     elif df.loc[i,'REGION'] == 'West':
#         df.loc[i,'REGION_NO'] = 4
#     else:
#         df.loc[i,'REGION_NO'] = 5  

# # If city type is the first variable, then---
# for i in range(len(df)):
#     # print(i)
#     if df.loc[i,'city type'] == 'urban':
#         df.loc[i,'city type'] = 1
#     elif df.loc[i,'city type'] == 'suburban':
#         df.loc[i,'city type'] = 2
#     elif df.loc[i,'city type'] == 'periurban':
#         df.loc[i,'city type'] = 3
#     elif df.loc[i,'city type'] == 'rural':
#         df.loc[i,'city type'] = 4
#     else:
#         df.loc[i,'city type'] = 5  

# # If future trend is the first variable, then---
# for i in range(len(df)):
#     # print(i)
#     if df.loc[i,'future trend from SSP 2'] == 'increasing':
#         df.loc[i,'future trend from SSP 2'] = 1
#     elif df.loc[i,'future trend from SSP 2'] == 'no trend':
#         df.loc[i,'future trend from SSP 2'] = 2
#     elif df.loc[i,'future trend from SSP 2'] == 'decreasing':
#         df.loc[i,'future trend from SSP 2'] = 3

# Check p-values 

In [15]:
# Encode the SSP 2 trend labels as integers (increasing=1, no trend=2,
# decreasing=3). Any other value in the column is left as it is.
df = df.reset_index(drop=True)

ssp2_col = 'future trend from SSP 2'
trend_codes = {'increasing': 1, 'no trend': 2, 'decreasing': 3}

# One masked assignment per label replaces the previous row-by-row .loc loop,
# which did several scalar lookups for every row of the frame.
# The masks are taken from a snapshot of the original labels.
ssp2_labels = df[ssp2_col].copy()
for trend_label, trend_code in trend_codes.items():
    df.loc[ssp2_labels == trend_label, ssp2_col] = trend_code

In [16]:
# Columns needed for the per-region significance tests
test_cols = ['future trend from SSP 2', 'median_income', 'hh_income_by_taxRate']


def _region_rows(region_name):
    """Return the test columns for one REGION, with incomplete rows dropped."""
    in_region = df[df['REGION'] == region_name].reset_index(drop=True)
    return in_region[test_cols].dropna()


df_south = _region_rows('South')
df_west = _region_rows('West')
df_midwest = _region_rows('Midwest')
df_northeast = _region_rows('Northeast')


In [17]:
# Cross tabulation 
CrosstabResult=pd.crosstab(index=df_south['future trend from SSP 2'],columns=df_south['hh_income_by_taxRate'])
print(CrosstabResult)
# importing the required function
from scipy.stats import chi2_contingency
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)
# P-Value is the Probability of H0 being True
# If P-Value&gt;0.05 then only we Accept the assumption(H0)
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])
print('\n')
from scipy.stats import f_oneway
# Perform one-way ANOVA
f_statistic, p_value = f_oneway(*[group['median_income'].values for name, group in df_south.groupby("future trend from SSP 2")])
# Print results
print('----Statistics from one way ANOVA----')
print("F-Statistic:", f_statistic)
print("P-value:", p_value)
print('\n')
# Kuruskul Wallis Test
from scipy.stats import kruskal
#perform Kruskal-Wallis Test 
H_statistic, p_value = kruskal(*[group["median_income"].values for name, group in df_south.groupby("future trend from SSP 2")])
print('----Statistics from Kruskal-Wallis----')
print("H-Statistic:", H_statistic)
print("P-value:", p_value)

hh_income_by_taxRate     10%   12%   22%  24%  32%  35%+
future trend from SSP 2                                 
1                         11  2279  1447  659   53    31
2                          7   608   172   32    1     0
3                         39  3406   477  101    1     1
The P-Value of the ChiSq Test is: 3.0409471316678297e-263


----Statistics from one way ANOVA----
F-Statistic: 818.4671241829363
P-value: 0.0


----Statistics from Kruskal-Wallis----
H-Statistic: 1808.874071342676
P-value: 0.0


In [18]:
# Cross tabulation 
CrosstabResult=pd.crosstab(index=df_west['future trend from SSP 2'],columns=df_west['hh_income_by_taxRate'])
print(CrosstabResult)
# importing the required function
from scipy.stats import chi2_contingency
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)
# P-Value is the Probability of H0 being True
# If P-Value&gt;0.05 then only we Accept the assumption(H0)
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])
print('\n')
from scipy.stats import f_oneway
# Perform one-way ANOVA
f_statistic, p_value = f_oneway(*[group['median_income'].values for name, group in df_west.groupby("future trend from SSP 2")])
# Print results
print('----Statistics from one way ANOVA----')
print("F-Statistic:", f_statistic)
print("P-value:", p_value)
print('\n')
# Kuruskul Wallis Test
from scipy.stats import kruskal
#perform Kruskal-Wallis Test 
H_statistic, p_value = kruskal(*[group["median_income"].values for name, group in df_west.groupby("future trend from SSP 2")])
print('----Statistics from Kruskal-Wallis----')
print("H-Statistic:", H_statistic)
print("P-value:", p_value)


hh_income_by_taxRate     10%   12%   22%  24%  32%  35%+
future trend from SSP 2                                 
1                         11  1049  1186  648   42    32
2                          1   365   184   64    1     1
3                         16  1038   326   56    2     0
The P-Value of the ChiSq Test is: 1.9543788075541208e-131


----Statistics from one way ANOVA----
F-Statistic: 329.60747344330446
P-value: 3.1935135789049384e-135


----Statistics from Kruskal-Wallis----
H-Statistic: 757.8376529163778
P-value: 2.739331426855337e-165


In [19]:
# Cross tabulation 
CrosstabResult=pd.crosstab(index=df_northeast['future trend from SSP 2'],columns=df_northeast['hh_income_by_taxRate'])
print(CrosstabResult)
# importing the required function
from scipy.stats import chi2_contingency
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)
# P-Value is the Probability of H0 being True
# If P-Value&gt;0.05 then only we Accept the assumption(H0)
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])
print('\n')
from scipy.stats import f_oneway
# Perform one-way ANOVA
f_statistic, p_value = f_oneway(*[group['median_income'].values for name, group in df_northeast.groupby("future trend from SSP 2")])
# Print results
print('----Statistics from one way ANOVA----')
print("F-Statistic:", f_statistic)
print("P-value:", p_value)
print('\n')
# Kuruskul Wallis Test
from scipy.stats import kruskal
#perform Kruskal-Wallis Test 
H_statistic, p_value = kruskal(*[group["median_income"].values for name, group in df_northeast.groupby("future trend from SSP 2")])
print('----Statistics from Kruskal-Wallis----')
print("H-Statistic:", H_statistic)
print("P-value:", p_value)

hh_income_by_taxRate     10%   12%  22%  24%  32%  35%+
future trend from SSP 2                                
1                          0   325  644  542   47    31
2                          1   189  211   97    6     0
3                          4  1384  771  267   10     8
The P-Value of the ChiSq Test is: 4.116473674286151e-138


----Statistics from one way ANOVA----
F-Statistic: 368.04642682253746
P-value: 7.613507348138938e-149


----Statistics from Kruskal-Wallis----
H-Statistic: 706.4916301062717
P-value: 3.866266326937705e-154


In [20]:
# Cross tabulation 
CrosstabResult=pd.crosstab(index=df_midwest['future trend from SSP 2'],columns=df_midwest['hh_income_by_taxRate'])
print(CrosstabResult)
# importing the required function
from scipy.stats import chi2_contingency
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)
# P-Value is the Probability of H0 being True
# If P-Value&gt;0.05 then only we Accept the assumption(H0)
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])
print('\n')
from scipy.stats import f_oneway
# Perform one-way ANOVA
f_statistic, p_value = f_oneway(*[group['median_income'].values for name, group in df_midwest.groupby("future trend from SSP 2")])
# Print results
print('----Statistics from one way ANOVA----')
print("F-Statistic:", f_statistic)
print("P-value:", p_value)
print('\n')
# Kuruskul Wallis Test
from scipy.stats import kruskal
#perform Kruskal-Wallis Test 
H_statistic, p_value = kruskal(*[group["median_income"].values for name, group in df_midwest.groupby("future trend from SSP 2")])
print('----Statistics from Kruskal-Wallis----')
print("H-Statistic:", H_statistic)
print("P-value:", p_value)

hh_income_by_taxRate     10%   12%   22%  24%  32%  35%+
future trend from SSP 2                                 
1                          7  1216  1036  301   16    14
2                          4   517   292   73    1     1
3                         20  4363  1384  167   11     3
The P-Value of the ChiSq Test is: 3.821526111351939e-138


----Statistics from one way ANOVA----
F-Statistic: 370.5945085639539
P-value: 1.1727179146691057e-155


----Statistics from Kruskal-Wallis----
H-Statistic: 715.7157547722887
P-value: 3.839711067486734e-156
