### Daten laden und zusammenführen
In diesem Schritt laden wir die Daten aus den Pickle-Dateien und führen die DataFrames zusammen.


In [1]:
import pandas as pd

# Read the two pickled input tables from the local ./pickle directory.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# come from a trusted source.
df_occupation = pd.read_pickle('./pickle/naics_occupation.pkl')
df_pattern = pd.read_pickle('./pickle/naics_pattern.pkl')

# Inner join: keep only rows whose (FIPS, naics) key occurs in both tables.
merged_df = df_occupation.merge(df_pattern, how='inner', on=['FIPS', 'naics'])


### Überprüfung der Überschneidungen

In [2]:
# Preview the first five rows of the merged frame.
merged_df.head(n=5)

Unnamed: 0,FIPS,State_GEOID_x,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation,State_GEOID_y,County_GEOID,...,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,13073,13,2373,"Highway, Street, and Bridge Construction",27,11-3051,Industrial Production Managers,0.022497,13,73,...,N,N,N,N,N,N,N,N,N,N
1,13073,13,2381,"Foundation, Structure, and Building Exterior C...",231,11-3051,Industrial Production Managers,0.06749,13,73,...,4,N,N,N,N,N,N,N,N,N
2,13073,13,2382,Building Equipment Contractors,868,11-3051,Industrial Production Managers,0.134981,13,73,...,10,N,N,N,N,N,N,N,N,N
3,13073,13,2383,Building Finishing Contractors,281,11-3051,Industrial Production Managers,0.078739,13,73,...,6,N,N,N,N,N,N,N,N,N
4,13073,13,3211,Sawmills and Wood Preservation,116,11-3051,Industrial Production Managers,0.978609,13,73,...,N,N,N,N,N,N,N,N,N,N


### Umwandlung von Zeichenfolgen in numerische Werte
Die Unternehmensgrößenklassen-Spalten enthalten 'N' für 'Nicht verfügbar'. Diese werden in 0 umgewandelt, und die restlichen Zeichenfolgen werden in Ganzzahlen umgewandelt.


In [3]:
# Rebuild the merged frame from the two loaded tables (inner join on FIPS and
# naics) so the conversion in this cell always starts from the raw merge.
merged_df = df_occupation.merge(df_pattern, how='inner', on=['FIPS', 'naics'])

# Helper that turns a column of count strings into integers
def convert_to_numeric(series):
    """Convert a column of counts to 64-bit integers, mapping 'N' to 0.

    Parameters
    ----------
    series : pandas.Series
        Values that are either the marker 'N' or an integer count, given as
        a string or already as a number.

    Returns
    -------
    pandas.Series
        Series with the same index as ``series`` and dtype int64.

    Raises
    ------
    ValueError
        If a value other than 'N' cannot be parsed as a number, is missing,
        or has a fractional part.
    """
    # Replace the marker with the *string* '0' rather than the integer 0, so
    # the replacement itself does not introduce mixed int/str values; the
    # parsing to numbers happens in a single pd.to_numeric pass.
    parsed = pd.to_numeric(series.replace('N', '0'))

    # Explicit width instead of the platform-dependent built-in ``int``.
    # Missing values make this cast raise (a ValueError subclass in pandas).
    as_int = parsed.astype('int64')

    # Refuse to silently truncate fractional inputs such as '4.5'.
    if not (as_int == parsed).all():
        raise ValueError("column contains non-integer values")
    return as_int

# Establishment size-class columns whose string values are converted to integers.
size_columns = [
    'n<5', 'n5_9', 'n10_19', 'n20_49', 'n50_99',
    'n100_249', 'n250_499', 'n500_999',
    'n1000', 'n1000_1', 'n1000_2', 'n1000_3', 'n1000_4',
]

# Convert every size-class column and swap the results into the frame in one go.
merged_df = merged_df.assign(
    **{name: convert_to_numeric(merged_df[name]) for name in size_columns}
)

### Festlegen der Aggregationsmethoden
Für jede numerische Spalte legen wir fest, wie die Werte aggregiert werden sollen. Summen werden für absolute Zahlen wie Beschäftigungszahlen und Unternehmensgrößenklassen berechnet.

In [4]:
# Aggregation spec for the groupby below: every listed column is summed.
agg_methods = dict.fromkeys(
    [
        # employment and payroll columns
        'emp_total_county_naics',
        'emp_occupation',
        'emp',
        'qp1',
        'ap',
        'est',
        # establishment size-class columns
        'n<5',
        'n5_9',
        'n10_19',
        'n20_49',
        'n50_99',
        'n100_249',
        'n250_499',
        'n500_999',
        'n1000',
        'n1000_1',
        'n1000_2',
        'n1000_3',
        'n1000_4',
    ],
    'sum',
)

### Aggregation der Daten
Die Daten werden nach dem FIPS-Code gruppiert und gemäß den festgelegten Aggregationsmethoden aggregiert.

In [5]:
# Collapse the merged rows to one row per FIPS code, applying agg_methods.
# NOTE(review): if merged_df holds several occupation rows per (FIPS, naics)
# pair, columns that describe the county-industry pair rather than the
# occupation are repeated on each of those rows and summed more than once
# here - confirm the row granularity of merged_df.
aggregated_df = merged_df.groupby('FIPS').agg(agg_methods)

# Turn the FIPS group key back into an ordinary column.
aggregated_df = aggregated_df.reset_index()
aggregated_df.head()

Unnamed: 0,FIPS,emp_total_county_naics,emp_occupation,emp,qp1,ap,est,n<5,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,1001,23060,288.62513,23060,293275,1278447,2806,1363,482,130,32,0,0,0,0,0,0,0,0,0
1,1003,336985,3596.387783,336985,4080329,18026558,25490,13502,4200,2950,1366,241,21,0,0,0,0,0,0,0
2,1005,25322,312.613142,25322,261186,980777,1087,404,40,0,0,0,0,0,0,0,0,0,0,0
3,1007,19611,224.743546,19611,299619,1400868,831,280,63,0,0,54,0,0,0,0,0,0,0,0
4,1009,32406,392.08067,32406,360149,1640600,4143,2401,300,267,117,0,0,0,0,0,0,0,0,0


### Speichern als Pickle

In [6]:
# Persist the per-FIPS aggregate next to the input pickles.
aggregated_df.to_pickle(path='./pickle/aggregated_df.pkl')

### Standardisierung

In [8]:
from sklearn.preprocessing import StandardScaler

# Every numeric column of the aggregated frame goes into the scaler.
# NOTE(review): when FIPS is stored as a number it is part of numeric_columns
# too, so it gets scaled here and is then overwritten with the raw codes
# below - exclude it before using numeric_columns as clustering features.
numeric_columns = aggregated_df.select_dtypes(include='number').columns

# Standardize: fit the scaler on the selected columns, then transform them.
scaler = StandardScaler()
scaler.fit(aggregated_df[numeric_columns])
scaled_data = scaler.transform(aggregated_df[numeric_columns])

# Wrap the scaled array in a DataFrame carrying the original column labels.
scaled_df = pd.DataFrame(data=scaled_data, columns=numeric_columns)

# Put the unscaled FIPS codes into the frame so rows can be identified.
scaled_df = scaled_df.assign(FIPS=aggregated_df['FIPS'])

# Show the first rows of the standardized data.
scaled_df.head()


Unnamed: 0,FIPS,emp_total_county_naics,emp_occupation,emp,qp1,ap,est,n<5,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,1001,-0.154038,-0.249458,-0.144999,-0.135615,-0.136299,-0.225959,-0.218393,-0.206697,-0.238091,-0.235106,-0.203745,-0.194395,-0.163454,-0.098823,-0.071687,-0.075711,-0.069097,-0.055293,-0.03795
1,1003,0.0117,0.224755,0.027281,-0.011807,-0.00732,0.419033,0.4158,0.449455,0.447432,0.164861,-0.030829,-0.171729,-0.163454,-0.098823,-0.071687,-0.075711,-0.069097,-0.055293,-0.03795
2,1005,-0.152844,-0.246019,-0.143758,-0.136665,-0.138592,-0.274837,-0.268495,-0.284701,-0.269693,-0.2447,-0.203745,-0.194395,-0.163454,-0.098823,-0.071687,-0.075711,-0.069097,-0.055293,-0.03795
3,1007,-0.155859,-0.258616,-0.146892,-0.135408,-0.135357,-0.282116,-0.274973,-0.280642,-0.269693,-0.2447,-0.165,-0.194395,-0.163454,-0.098823,-0.071687,-0.075711,-0.069097,-0.055293,-0.03795
4,1009,-0.149104,-0.234626,-0.13987,-0.133429,-0.13351,-0.187943,-0.164163,-0.238816,-0.204787,-0.209621,-0.203745,-0.194395,-0.163454,-0.098823,-0.071687,-0.075711,-0.069097,-0.055293,-0.03795


### Normalisierung

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Every numeric column of the aggregated frame goes into the scaler.
# NOTE(review): when FIPS is stored as a number it is part of numeric_columns
# too, so it gets rescaled here and is then overwritten with the raw codes
# below - exclude it before using numeric_columns as clustering features.
numeric_columns = aggregated_df.select_dtypes(include='number').columns

# Min-max normalization: fit the scaler on the selected columns, then
# transform them. This rebinds the name `scaler`.
scaler = MinMaxScaler()
scaler.fit(aggregated_df[numeric_columns])
normalized_data = scaler.transform(aggregated_df[numeric_columns])

# Wrap the normalized array in a DataFrame carrying the original column labels.
normalized_df = pd.DataFrame(data=normalized_data, columns=numeric_columns)

# Put the unscaled FIPS codes into the frame so rows can be identified.
normalized_df = normalized_df.assign(FIPS=aggregated_df['FIPS'])

# Show the first rows of the normalized data.
normalized_df.head()


Unnamed: 0,FIPS,emp_total_county_naics,emp_occupation,emp,qp1,ap,est,n<5,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,1001,0.000302,0.001624,0.000302,0.000268,0.000273,0.002974,0.002543,0.003121,0.001215,0.000399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1003,0.004412,0.020233,0.004412,0.003733,0.003849,0.02704,0.025192,0.027199,0.027575,0.017032,0.007369,0.00097,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1005,0.000331,0.001759,0.000331,0.000239,0.000209,0.00115,0.000754,0.000259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1007,0.000257,0.001264,0.000257,0.000274,0.000299,0.000878,0.000522,0.000408,0.0,0.0,0.001651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1009,0.000424,0.002206,0.000424,0.00033,0.00035,0.004392,0.00448,0.001943,0.002496,0.001459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
