<a href="https://colab.research.google.com/github/wmok12/Thesis-research-minimumwage-employment/blob/main/EDA_construction_industry.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and loading data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
# Read the CSV into a DataFrame and print it.
# NOTE(review): the path looks like a Colab Google Drive mount; it must be
# mounted before this cell runs - confirm.
dataset = pd.read_csv(
    '/content/drive/MyDrive/Thesis/Voorbereiding/dataset_cps_construction.csv'
)
print(dataset)

In [None]:
# Collect every column name except YEAR and MONTH into a one-column
# DataFrame called "Features".
feature_names = dataset.columns.drop(['YEAR', 'MONTH'])
features_dataset = pd.DataFrame(feature_names, columns=['Features'])

# Show the table of feature names
print(features_dataset)

# Statistics

In [None]:
# Descriptive statistics for AGE, followed by its skewness and kurtosis
age_series = dataset['AGE']
print(age_series.describe())
print("Skewness:\n", age_series.skew())
print("Kurtosis:\n", age_series.kurtosis())


In [None]:
# Descriptive statistics for MINIMUM WAGE, followed by its skewness and
# kurtosis
min_wage_series = dataset['MINIMUM WAGE']
print(min_wage_series.describe())
print("Skewness:\n", min_wage_series.skew())
print("Kurtosis:\n", min_wage_series.kurtosis())

In [None]:
# Chi squared test
def perform_chi_square(variable, target, data=None):
    """Run a chi-square test of independence on two columns and print it.

    Cross-tabulates ``variable`` against ``target`` and passes the resulting
    contingency table to ``scipy.stats.chi2_contingency``. The statistic,
    p-value and degrees of freedom are printed; nothing is returned.

    Args:
        variable: Name of the column whose categories form the table rows.
        target: Name of the column whose categories form the table columns.
        data: DataFrame containing both columns. When omitted, the
            module-level ``dataset`` is used, so existing two-argument
            calls behave exactly as before.
    """
    # Fall back to the notebook-wide DataFrame only when no frame is given.
    source = dataset if data is None else data
    contingency_table = pd.crosstab(source[variable], source[target])
    # The fourth result (expected frequencies) is not reported, so discard it.
    chi_squared_statistic, p_value, dof, _ = stats.chi2_contingency(contingency_table)

    print(f"\nChi-squared test for {variable} vs {target}:")
    print(f"Chi-squared statistic: {chi_squared_statistic:.2f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Degrees of freedom: {dof}")

# Run the chi-square test for every categorical column against the
# EMPSTAT column and print each result.
categorical_vars = [
    'EDUC',
    'SEX',
    'CLASSWKR',
    'MARST',
    'RACE',
    'LABFORCE',
]

for var in categorical_vars:
    perform_chi_square(variable=var, target='EMPSTAT')

# Distribution & outliers

In [None]:
# Distribution & outliers age

# Single vertical box plot of the AGE column.
plt.figure(figsize=(10, 5))
# Drop missing values first: a NaN in the input would leave the box
# statistics undefined and the plot empty.
plt.boxplot(dataset['AGE'].dropna())
plt.title('Distribution & outliers age', fontsize=22)
# The vertical axis of a box plot shows the variable's values, not a count,
# so it is labelled with the variable name.
plt.ylabel('Age', fontsize=18)
plt.xticks([])  # only one box, so no x tick is needed
plt.yticks(fontsize=16)
plt.show()


In [None]:
# Distribution & outliers Minimum Wage

# Single vertical box plot of the MINIMUM WAGE column.
plt.figure(figsize=(10, 5))
# Drop missing values first: a NaN in the input would leave the box
# statistics undefined and the plot empty.
plt.boxplot(dataset['MINIMUM WAGE'].dropna())
plt.title('Distribution & Outliers Minimum Wage', fontsize=22)
# The vertical axis of a box plot shows the variable's values, not a count,
# so it is labelled with the variable name.
plt.ylabel('Minimum Wage', fontsize=18)
plt.xticks([])  # only one box, so no x tick is needed
plt.yticks(fontsize=16)
plt.show()


In [None]:
# Bar chart: number of rows per EMPSTAT category

plt.figure(figsize=(10, 5))
sns.countplot(
    x='EMPSTAT',
    data=dataset,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Employment status', fontsize=18)
plt.title('Number of observations per employment status', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Dashed, semi-transparent horizontal guide lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()

In [None]:
# Bar chart: number of rows per EDUC category, most frequent first

# Category frequencies in descending order; the index gives the bar order
educ_counts = dataset['EDUC'].value_counts()
educ_counts = educ_counts.sort_values(ascending=False)
sorted_educ = educ_counts.index

plt.figure(figsize=(10, 5))
sns.countplot(
    x='EDUC',
    data=dataset,
    order=sorted_educ,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Education', fontsize=18)
plt.title('Number of observations per educational level', fontsize=22)

# Tick labels: x labels are rotated and right-aligned
plt.xticks(rotation=45, ha='right', fontsize=16)
plt.yticks(fontsize=16)

# Dashed, semi-transparent horizontal guide lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()


In [None]:
# Bar chart: number of rows per SEX category

plt.figure(figsize=(10, 5))
# countplot is used because the column is categorical
sns.countplot(
    x='SEX',
    data=dataset,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Gender', fontsize=18)
plt.title('Number of observations per gender', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Add dashed, semi-transparent horizontal grid lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()

In [None]:
# Bar chart: number of rows per CLASSWKR category

plt.figure(figsize=(10, 5))
# countplot is used because the column is categorical
sns.countplot(
    x='CLASSWKR',
    data=dataset,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Class of worker', fontsize=18)
plt.title('Number of observations per class of worker', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Add dashed, semi-transparent horizontal grid lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()

In [None]:
# Bar chart: number of rows per MARST category, most frequent first

# Category frequencies in descending order; the index gives the bar order
marst_counts = dataset['MARST'].value_counts()
marst_counts = marst_counts.sort_values(ascending=False)
sorted_marst = marst_counts.index

plt.figure(figsize=(10, 5))
# countplot is used because the column is categorical
sns.countplot(
    x='MARST',
    data=dataset,
    order=sorted_marst,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Marital status', fontsize=18)
plt.title('Number of observations per marital status', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Add dashed, semi-transparent horizontal grid lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()

In [None]:
# Bar chart: number of rows per RACE category

plt.figure(figsize=(10, 5))
# countplot is used because the column is categorical
sns.countplot(
    x='RACE',
    data=dataset,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Race', fontsize=18)
plt.title('Number of observations per race', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Add dashed, semi-transparent horizontal grid lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()

In [None]:
# Bar chart: number of rows per LABFORCE category

plt.figure(figsize=(10, 5))
# countplot is used because the column is categorical
sns.countplot(
    x='LABFORCE',
    data=dataset,
    color='#ADD8E6',
)

# Axis labels and title
plt.ylabel('Count', fontsize=18)
plt.xlabel('Labor force status', fontsize=18)
plt.title('Number of observations per labor force status', fontsize=22)

# Tick-label sizes
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Add dashed, semi-transparent horizontal grid lines
plt.grid(axis='y', alpha=0.5, linestyle='--')
plt.show()