In [2]:
import pandas as pd

# Load the data
def get_trends(filename, output_path='USA_trends.csv'):
    """Compute the year-over-year fractional change in total employment and save it.

    The CSV at ``filename`` is read, the ``Emp`` column is summed per ``year``,
    and each year's change is expressed as a fraction of the previous year's
    total. The first year has no predecessor, so its trend is stored as 0.

    Parameters
    ----------
    filename : str or path-like
        CSV file that contains at least the ``year`` and ``Emp`` columns.
    output_path : str or path-like, optional
        Where the result is written. Defaults to ``'USA_trends.csv'``, the
        path this function has always used.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``year`` and ``trend`` -- the same table that is written
        to ``output_path``.
    """
    data = pd.read_csv(filename, low_memory=False)

    # Aggregate once and reuse the totals for both the difference and the base.
    yearly_emp = data.groupby('year')['Emp'].sum()
    annual_emp_trends = yearly_emp.diff().div(yearly_emp.shift(1)).reset_index()
    annual_emp_trends.columns = ['year', 'trend']

    # The first year (and any 0/0 case) yields NaN; record those as no change.
    annual_emp_trends = annual_emp_trends.fillna(0)

    annual_emp_trends.to_csv(output_path, index=False)
    return annual_emp_trends

# Build the yearly employment trend table from the national QWI extract;
# the result is written to 'USA_trends.csv' in the working directory.
get_trends('./QWI_Data/QWI_USA.csv')



In [3]:
import pandas as pd

# Read the national QWI extract
file_path = './QWI_Data/QWI_USA.csv'
df = pd.read_csv(file_path)

# Restrict the frame to the fields referenced below
df = df[['geography', 'firmage_label.value', 'year', 'quarter', 'Emp']]

# Step 1: translate the firm-age labels into column-friendly codes, then
# discard the original label column
firmage_map = {
    "0-1 Years": "_0_1_years",
    "2-3 Years": "_2_3_years",
    "4-5 Years": "_4_5_years",
    "All Firm Ages": "_all_years",
}
df['firmage'] = df['firmage_label.value'].map(firmage_map)
df = df.drop(columns=['firmage_label.value'])

# Step 2: one row per (year, quarter), one column per firm-age code,
# with employment summed inside each cell
df_wide = pd.pivot_table(
    df,
    index=['year', 'quarter'],
    columns='firmage',
    values='Emp',
    aggfunc='sum',
).reset_index()

# Step 3: young-firm employment (ages 0-5) and its share of all firm ages
young_firm_cols = ['_0_1_years', '_2_3_years', '_4_5_years']
df_wide['EMP_youngfirm'] = df_wide[young_firm_cols].sum(axis=1)
df_wide['YF_emp_share'] = df_wide['EMP_youngfirm'].div(df_wide['_all_years'])

# Step 4: average the quarterly shares within each year
annual_emp_share = (
    df_wide
    .groupby('year')['YF_emp_share']
    .mean()
    .reset_index()
)

# Step 5: year-over-year growth relative to the previous year's share
previous_share = annual_emp_share['YF_emp_share'].shift(1)
annual_emp_share['lagged_emp_share'] = previous_share
annual_emp_share['YoY_growth'] = (
    annual_emp_share['YF_emp_share'].sub(previous_share).div(previous_share)
)

annual_emp_share.to_csv('./QWI_Data/US_young_firm_emp_share_trend_2017_2022.csv', index=False)




In [4]:
import json
# Inclusive range of years covered by the request
min_year = 2015
max_year = 2022

# Every quarter in that range, written as "<year>.<quarter>"
all_quarters = [
    f"{year}.{q}"
    for year in range(min_year, max_year + 1)
    for q in range(1, 5)
]

# Request definition; key order is kept because it determines the JSON layout
usa_data = {
    # Using GEOID '00'
    "selected_areas": {"geo_ids": ["00"]},
    "firm_attributes": {
        "naics_level": "naics2",
        "selected_naics": ["00"],
        "ownership": "op",
        "fas": "fa",
        "firm_age": ["1", "2", "3"],
        "firm_size": ["0"],
    },
    "worker_attributes": {
        "group": "se",
        "attr1": ["0"],
        "attr2": ["E1", "E2", "E3", "E4"],
    },
    "indicators": ["Emp"],
    "quarters": all_quarters,
    "export_labels": True,
    "worker_xing": "se",
}

# Serialise the request with 4-space indentation and store it for upload
file_name = f"./Upload/USA_DATA.qwi"
with open(file_name, "w") as file:
    file.write(json.dumps(usa_data, indent=4))


In [16]:
def apply_trends(df, trends_path='usa_trends.csv'):
    """Project each row's ``Emp`` forward from the latest earlier year using yearly trends.

    The trend table is read with its first column as the index. It is looked
    up as ``trend_df.loc[<geography label>, "<year>"]`` and each value is
    treated as a percentage (it is divided by 100 before being applied).

    For every row of ``df`` the row with the most recent earlier year for the
    same ``geography_label.value`` supplies the starting ``Emp``. The trend of
    each following year up to the row's own year is then compounded onto it
    and the result replaces the row's ``Emp``. Years or geographies that are
    absent from the trend table contribute no change. Rows with no earlier
    year for their geography are left as they are.

    Lookups read the frame as it currently stands, so a row that was already
    projected can serve as the base for a later row; the outcome therefore
    depends on row order.

    NOTE(review): ``get_trends`` in this notebook writes 'USA_trends.csv' as a
    long ``year``/``trend`` table of fractions, which is not the layout read
    here -- confirm which trend file this function is meant to consume.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``geography_label.value``, ``year`` and ``Emp``. The
        frame is modified in place (``Emp`` is cast to float and overwritten).
    trends_path : str or path-like, optional
        Location of the trend CSV. Defaults to ``'usa_trends.csv'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned without projection when the
        last column of the trend table is a year earlier than the latest
        year in ``df``.
    """
    trend_df = pd.read_csv(trends_path, index_col=0)
    # Column labels are year strings; normalise them so that every lookup
    # below uses the same key type.
    trend_df.columns = trend_df.columns.astype(str)

    # Convert 'Emp' to float for fractional changes
    df['Emp'] = df['Emp'].astype(float)

    # The last column is taken to be the most recent year with trend data
    most_recent_year = int(trend_df.columns[-1])
    if most_recent_year < df['year'].max():
        return df

    for index, row in df.iterrows():
        current_geo = row['geography_label.value']
        target_year = row['year']

        # Candidate base rows: same geography, strictly earlier year
        earlier_rows = df[(df['geography_label.value'] == current_geo) &
                          (df['year'] < target_year)]
        if earlier_rows.empty:
            # Nothing to project from (e.g. the first year of a geography);
            # previously this case raised IndexError on .iloc[0].
            continue

        most_recent_emp_data = earlier_rows.sort_values(by='year', ascending=False).iloc[0]
        current_emp = most_recent_emp_data['Emp']
        start_year = int(most_recent_emp_data['year'])

        geo_has_trend = current_geo in trend_df.index

        # Compound the trend of every year after the base year, up to the target
        while start_year < target_year:
            start_year += 1

            # Columns are strings, so the membership test must use the string
            # form of the year (an int never matched and no trend was applied).
            year_key = str(start_year)
            if geo_has_trend and year_key in trend_df.columns:
                trend = trend_df.loc[current_geo, year_key]
                current_emp *= (1 + trend / 100)

        # Update the 'Emp' value for the original row
        df.at[index, 'Emp'] = current_emp
    return df


In [46]:
def transform_data(df):
    """Compute the annual young-firm employment share (percent) and its yearly change.

    Employment is summed per (year, quarter, firm-age group). For each quarter
    the employment of firms aged 0-5 years is divided by the employment of all
    firm ages and multiplied by 100. The quarterly percentages are averaged
    per year and the difference from the previous year is added.

    The input frame is not modified, so the function can be called
    repeatedly on the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``firmage_label.value``, ``year``, ``quarter`` and ``Emp``.
        Firm-age labels other than the four mapped below are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``YF_Emp_Share`` (mean quarterly share in percent)
        and ``Change`` (difference from the previous year; NaN for the first).
    """
    firmage_map = {
        "0-1 Years": "Emp_0_1_years",
        "2-3 Years": "Emp_2_3_years",
        "4-5 Years": "Emp_4_5_years",
        "All Firm Ages": "Emp_all_years"
    }
    expected_columns = list(firmage_map.values())

    # Step 1: build a working copy with the firm-age code; the caller's frame
    # keeps its 'firmage_label.value' column and gains no new columns.
    work = df[['year', 'quarter', 'Emp']].assign(
        firmage=df['firmage_label.value'].map(firmage_map)
    )

    # Step 2: total employment per year, quarter and firm-age code
    # (rows whose label is not in the map have a NaN code and drop out here)
    df_grouped = work.groupby(['year', 'quarter', 'firmage'])['Emp'].sum().reset_index()

    # Step 3: one row per (year, quarter), one column per firm-age code
    df_wide = df_grouped.pivot(index=['year', 'quarter'], columns='firmage', values='Emp').reset_index()
    df_wide.columns.name = None

    # A firm-age group that never occurs gets an explicit zero column
    for col in expected_columns:
        if col not in df_wide:
            df_wide[col] = 0

    # Step 4: young-firm employment and its percentage of all firm ages
    young_emp = df_wide[['Emp_0_1_years', 'Emp_2_3_years', 'Emp_4_5_years']].sum(axis=1)
    share = young_emp / df_wide['Emp_all_years'] * 100

    # Division by a zero total gives +/-inf; those quarters are counted as 0
    df_wide['YF_Emp_Share'] = share.replace([float('inf'), float('-inf')], 0)

    # Step 5: annual mean of the quarterly shares and change versus prior year
    annual_df = df_wide.groupby('year')['YF_Emp_Share'].mean().reset_index()
    annual_df['Change'] = annual_df['YF_Emp_Share'].diff()

    return annual_df


# Load the national QWI extract and run transform_data on it; the final
# bare expression displays the resulting per-year table as the cell output.
df = pd.read_csv('./QWI_Data/QWI_USA.csv')
transformed_multi_year_df = transform_data(df)
transformed_multi_year_df

Unnamed: 0,year,YF_Emp_Share,Change
0,2015,10.103135,
1,2016,10.046644,-0.056491
2,2017,10.005462,-0.041182
3,2018,10.06046,0.054998
4,2019,10.244591,0.18413
5,2020,9.870826,-0.373764
6,2021,10.209489,0.338663
7,2022,10.724542,0.515053


In [24]:
# Load the 2022 extract and order its rows by geography label
df = pd.read_csv('./QWI_Data/QWI_2022.csv')
df_sorted = df.sort_values('geography_label.value')

# Write the ordered rows to a new CSV file, leaving out the index
df_sorted.to_csv('./QWI_2022_pre_trend.csv', index=False)

In [3]:
import pandas as pd

# Read the CSV file
file_path = "./QWI_Data/QWI_USA.csv"
df = pd.read_csv(file_path)

# Derive the 'firmage' code from 'firmage_label.value'
firmage_codes = {
    '0-1 Years': '_0_1_years',
    '2-3 Years': '_2_3_years',
    '4-5 Years': '_4_5_years',
    'All Firm Ages': '_all_years',
}
df['firmage'] = df['firmage_label.value'].replace(firmage_codes)

# Remove 'firmage_label.value' and 'geography'
df = df.drop(columns=['firmage_label.value', 'geography'])

# Long to wide: one row per (year, quarter), one column per firm-age code,
# keeping the first 'Emp' value found for each cell
df_wide = pd.pivot_table(
    df,
    index=['year', 'quarter'],
    columns='firmage',
    values='Emp',
    aggfunc='first',
)

# Employment at firms aged 0-5 years and its share of all firm ages
young_cols = ['_0_1_years', '_2_3_years', '_4_5_years']
df_wide['EMP_youngfirm'] = df_wide[young_cols].sum(axis=1)
df_wide['YF_emp_share'] = df_wide['EMP_youngfirm'].div(df_wide['_all_years'])

# Keep only the share column
df_wide = df_wide.drop(columns=young_cols + ['EMP_youngfirm', '_all_years'])

# Average the quarterly shares per year, indexed by year
df_collapsed = (
    df_wide
    .groupby('year')['YF_emp_share']
    .mean()
    .reset_index()
    .set_index('year')
)

# Previous year's share and the relative change against it
df_collapsed['lagged_emp_share'] = df_collapsed['YF_emp_share'].shift(1)
df_collapsed['trends'] = (
    df_collapsed['YF_emp_share']
    .sub(df_collapsed['lagged_emp_share'])
    .div(df_collapsed['lagged_emp_share'])
)
df_collapsed


Unnamed: 0_level_0,YF_emp_share,lagged_emp_share,YoY_growth
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,0.106459,,
2016,0.105837,0.106459,-0.005847
2017,0.105545,0.105837,-0.002757
2018,0.106172,0.105545,0.005942
2019,0.107889,0.106172,0.016174
2020,0.103811,0.107889,-0.037803
2021,0.107762,0.103811,0.038065
2022,0.112608,0.107762,0.044962


In [5]:
# Read the CSV file
file_path = "./QWI_Data/QWI_USA.csv"
df = pd.read_csv(file_path)

# Derive the 'firmage' code from 'firmage_label.value'
firmage_codes = {
    '0-1 Years': '_0_1_years',
    '2-3 Years': '_2_3_years',
    '4-5 Years': '_4_5_years',
    'All Firm Ages': '_all_years',
}
df['firmage'] = df['firmage_label.value'].replace(firmage_codes)

# Derive the 'edu_level' code from 'education_label.value'
education_codes = {
    "Bachelor's degree or advanced degree": '_bachelors',
    'Less than high school': '_less_high',
    'High school or equivalent, no college': '_high_school',
    'Some college or Associate degree': '_associate',
}
df['edu_level'] = df['education_label.value'].replace(education_codes)

# Keep rows that are neither the all-education aggregate nor the
# all-firm-ages aggregate, then drop the label columns and 'geography'
keep_rows = (
    (df['education_label.value'] != "All Education Categories")
    & (df['firmage'] != '_all_years')
)
df = df.loc[keep_rows].drop(
    columns=['education_label.value', 'firmage_label.value', 'geography']
)

# Long to wide: one row per (year, quarter, firmage), one column per
# education code, keeping the first 'Emp' value found for each cell
df_wide = pd.pivot_table(
    df,
    index=['year', 'quarter', 'firmage'],
    columns='edu_level',
    values='Emp',
    aggfunc='first',
)

# Total employment across the education columns
df_wide['Emp_all'] = df_wide.sum(axis=1)

# Share of employment held by the '_bachelors' group
df_wide['knowledge_intensity_ratio'] = df_wide['_bachelors'].div(df_wide['Emp_all'])

# Average the ratio per year, indexed by year
df_collapsed = (
    df_wide
    .groupby('year')['knowledge_intensity_ratio']
    .mean()
    .reset_index()
    .set_index('year')
)

# Previous year's ratio and the relative change against it
df_collapsed['lagged_intensity_ratio'] = df_collapsed['knowledge_intensity_ratio'].shift(1)
df_collapsed['YoY_growth'] = (
    df_collapsed['knowledge_intensity_ratio']
    .sub(df_collapsed['lagged_intensity_ratio'])
    .div(df_collapsed['lagged_intensity_ratio'])
)
df_collapsed

Unnamed: 0_level_0,knowledge_intensity_ratio,lagged_intensity_ratio,YoY_growth
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,0.249804,,
2016,0.24965,0.249804,-0.000619
2017,0.249856,0.24965,0.000828
2018,0.250837,0.249856,0.003924
2019,0.253263,0.250837,0.009672
2020,0.257616,0.253263,0.017187
2021,0.261587,0.257616,0.015415
2022,0.261916,0.261587,0.001256


In [9]:
# Load the BLS employment table without treating any column as the index
df = pd.read_csv(
    './BLS_Data/Employment_Data_Monthly_Annual.csv',
    low_memory=False,
    index_col=False,
)

# Keep the series id, every column whose name contains "Annual",
# and whatever column comes last in the file
last_col = df.columns[-1]
annual_cols = ['series_id']
annual_cols.extend(col for col in df.columns if "Annual" in col)
annual_cols.append(last_col)

df = df[annual_cols]
df

Unnamed: 0,series_id,2017-Annual,2018-Annual,2019-Annual,2020-Annual,2021-Annual,2022-Annual,2023-June
0,ENU0100110010,10822.666667,10973.666667,11106.500000,10794.333333,11122.750000,11535.166667,11838
1,ENU0100310010,71998.916667,74051.416667,76418.833333,73087.333333,76546.000000,79972.333333,85444
2,ENU0100510010,8011.666667,8113.333333,8239.166667,7935.250000,7719.166667,7822.833333,7696
3,ENU0100710010,4109.750000,4215.416667,4602.333333,4653.250000,4755.750000,4835.833333,4812
4,ENU0100910010,8327.416667,8535.166667,8712.333333,8380.416667,8495.833333,8725.583333,8986
...,...,...,...,...,...,...,...,...
3136,ENU5603710010,22214.416667,22275.166667,22292.500000,20120.583333,19796.416667,20342.250000,21074
3137,ENU5603910010,20783.583333,20959.250000,21429.083333,19106.333333,21133.583333,22485.583333,25861
3138,ENU5604110010,8174.583333,8218.500000,8291.583333,7847.333333,7876.583333,8086.250000,8353
3139,ENU5604310010,3629.916667,3617.583333,3517.833333,3467.083333,3550.250000,3510.916667,3692


In [None]:
// 