In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def clean_experience(x):
    """Helper function to clean years of experience values"""
    if x == 'unknown':
        return x  # Keep unknown as its own category
    elif x == '<1':
        return 0
    elif x == '>20':
        return 21
    else:
        return float(x)

def preprocess_data(df, 
                   columns_to_encode,
                   columns_to_standardize,
                   n_most_frequent_counties=None,
                   experience_bins=None):
    """
    Preprocesses the data according to specified rules.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe to preprocess
    columns_to_encode : list
        List of column names to one-hot encode
    columns_to_standardize : list
        List of column names to standardize
    n_most_frequent_counties : int, optional
        Number of most frequent counties to keep, rest will be grouped as 'Other'
        If None, no grouping is performed
    experience_bins : dict, optional
        Dictionary mapping experience ranges to categories
        Example: {(0, 5): 'Junior', (5, 10): 'Mid', (10, float('inf')): 'Senior'}
        
    Returns:
    --------
    pandas.DataFrame
        Preprocessed dataframe
    dict
        Dictionary containing the fitted StandardScaler for each standardized column
    dict
        Dictionary containing preprocessing metadata
    """
    # Create a copy to avoid modifying the original dataframe
    df_processed = df.copy()
    
    # Initialize metadata dictionary
    metadata = {
        'top_counties': None,
        'experience_bins': experience_bins,
        'categorical_columns': {},
        'binary_columns': [],
        'final_columns': [],  # Store final column order
        'encoded_columns': {} # Store encoded column names for each original column
    }
    
    # Handle county grouping if specified
    if n_most_frequent_counties is not None:
        county_counts = df_processed['county'].value_counts()
        metadata['top_counties'] = county_counts.head(n_most_frequent_counties).index.tolist()
        df_processed['county'] = df_processed['county'].apply(
            lambda x: x if x in metadata['top_counties'] else 'Other'
        )
    
    # Clean years of experience
    df_processed['years_of_experience_numeric'] = df_processed['years_of_experience'].apply(clean_experience)
    
    # Handle experience binning if specified
    if experience_bins:
        df_processed['years_of_experience'] = df_processed['years_of_experience_numeric'].apply(
            lambda x: x if x == 'unknown' else next(
                (category for (low, high), category in experience_bins.items() 
                 if low <= x < high),
                'Other'  # Default category if no range matches
            )
        )
    
    # One-hot encoding
    for col in columns_to_encode:

            
        # Store unique categories for each categorical column
        unique_categories = sorted(df_processed[col].unique())  # Sort for consistency
        metadata['categorical_columns'][col] = unique_categories
        
        # Get dummies and add prefix to avoid column name conflicts
        dummies = pd.get_dummies(df_processed[col], prefix=col,drop_first=True)
        
        # Store the encoded column names for this original column
        metadata['encoded_columns'][col] = [f"{col}_{category}" for category in unique_categories]
        
        # Drop the original column and join the dummy variables
        df_processed = df_processed.drop(col, axis=1)
        df_processed = pd.concat([df_processed, dummies], axis=1)
    
    # Standardization
    scalers = {}
    for col in columns_to_standardize:
        # Handle years_of_experience specially
        if col == 'years_of_experience':
            col = 'years_of_experience_numeric'
            
            # Only standardize non-unknown values
            mask = df_processed[col] != 'unknown'
            numeric_values = df_processed.loc[mask, col].astype(float).values.reshape(-1, 1)
            
            scaler = StandardScaler()
            scaler.fit(numeric_values)
            
            # Transform only numeric values, keep 'unknown' as is
            df_processed.loc[mask, col] = scaler.transform(numeric_values)
            
        else:
            scaler = StandardScaler()
            df_processed[col] = scaler.fit_transform(df_processed[[col]])
            
        scalers[col] = scaler
    
    # Clean up intermediate columns if not needed
    if 'years_of_experience_numeric' in df_processed.columns and \
       'years_of_experience_numeric' not in columns_to_standardize:
        df_processed = df_processed.drop('years_of_experience_numeric', axis=1)
    
    # Store final column order
    metadata['final_columns'] = df_processed.columns.tolist()
    
    return df_processed, scalers, metadata

def transform_new_data(df, 
                      metadata,
                      columns_to_encode,
                      scalers):
    """
    Transforms new data using the same preprocessing steps and fitted scalers.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        New data to transform
    metadata : dict
        Dictionary containing preprocessing metadata
    columns_to_encode : list
        List of column names to one-hot encode
    scalers : dict
        Dictionary of fitted StandardScaler objects
        
    Returns:
    --------
    pandas.DataFrame
        Transformed dataframe
    """
    # Create a copy to avoid modifying the original dataframe
    df_transformed = df.copy()
    
    # Handle county grouping if it was used in training
    if metadata['top_counties'] is not None:
        df_transformed['county'] = df_transformed['county'].apply(
            lambda x: x if x in metadata['top_counties'] else 'Other'
        )
    
    # Clean years of experience
    df_transformed['years_of_experience_numeric'] = df_transformed['years_of_experience'].apply(clean_experience)
    
    # Handle experience binning if it was used in training
    if metadata['experience_bins']:
        df_transformed['years_of_experience'] = df_transformed['years_of_experience_numeric'].apply(
            lambda x: x if x == 'unknown' else next(
                (category for (low, high), category in metadata['experience_bins'].items() 
                 if low <= x < high),
                'Other'  # Default category if no range matches
            )
        )
    
    # Initialize a dictionary to store all transformed columns
    transformed_columns = {}
    
    # One-hot encoding
    for col in columns_to_encode:
        if col in metadata['binary_columns']:
            transformed_columns[col] = df_transformed[col].apply(lambda x: 1 if x=='Yes' else 0)
            continue
            
        # Get encoded columns from metadata
        encoded_cols = metadata['encoded_columns'].get(col, [])
        
        # Create dummy variables
        dummies = pd.get_dummies(df_transformed[col], prefix=col)
        
        # Ensure all training categories are present
        for dummy_col in encoded_cols:
            if dummy_col not in dummies.columns:
                dummies[dummy_col] = 0
        
        # Only keep the columns that were present in training
        dummies = dummies[encoded_cols]
        
        # Add to transformed columns
        for col_name in dummies.columns:
            transformed_columns[col_name] = dummies[col_name]
    
    # Standardization
    for col, scaler in scalers.items():
        # Handle years_of_experience specially
        if col == 'years_of_experience':
            col = 'years_of_experience_numeric'
            
            # Only standardize non-unknown values
            mask = df_transformed[col] != 'unknown'
            numeric_values = df_transformed.loc[mask, col].astype(float).values.reshape(-1, 1)
            
            # Create a new series with the same index as the original
            transformed_col = df_transformed[col].copy()
            transformed_col.loc[mask] = scaler.transform(numeric_values).flatten()
            transformed_columns[col] = transformed_col
            
        else:
            transformed_columns[col] = scaler.transform(df_transformed[[col]]).flatten()
    
    # Create final dataframe with correct column order
    final_df = pd.DataFrame(transformed_columns)
    
    # Ensure all columns from training are present
    for col in metadata['final_columns']:
        if col not in final_df.columns:
            final_df[col] = 0  # Add missing columns with zeros
    
    # Return dataframe with same columns in same order as training
    return final_df[metadata['final_columns']]

In [54]:
training_data = pd.read_csv('job_change_train.csv')
testing_data = pd.read_csv('job_change_test.csv')

In [55]:
# Define preprocessing parameters
columns_to_encode = ['gender', 'education', 'field_of_studies','is_studying','county','years_since_job_change','years_of_experience','size_of_company','type_of_company']
columns_to_standardize = ['age', 'relative_wage','hours_of_training']

# Optional parameters
n_most_frequent_counties = 10  # Set to None to keep all counties
experience_bins = {
    (0, 3): '0-3',
    (3, 7): '3-7',
    (7, 15): '7-15',
    (15, float('inf')): '15+'
}


labels = training_data['willing_to_change_job']
labels = labels.apply(lambda x: 1 if x=='Yes' else 0)
training_data.drop(['id','willing_to_change_job'],inplace=True,axis=1)
# Preprocess training data
training_data_processed, scalers, metadata  = preprocess_data(
    training_data,
    columns_to_encode= columns_to_encode,
    columns_to_standardize = columns_to_standardize,
    n_most_frequent_counties=n_most_frequent_counties,  # Optional
    experience_bins=experience_bins  # Optional
)
training_data_processed


Unnamed: 0,age,relative_wage,hours_of_training,is_certified,gender_Male,gender_Other,gender_Unknown,education_higher_BSc,education_higher_MSc,education_no_information,...,size_of_company_500-999,size_of_company_5000-9999,size_of_company_<10,size_of_company_unknown,type_of_company_no_information,type_of_company_other,type_of_company_private_company,type_of_company_public_sector,type_of_company_startup_with_funding,type_of_company_startup_wo_funding
0,-1.328185,0.741406,-0.779442,1,True,False,False,True,False,False,...,False,False,True,False,False,False,True,False,False,False
1,-0.715787,0.217798,-0.680106,1,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
2,0.631489,-1.474594,0.511924,1,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,1.733806,-1.047512,-0.481434,1,False,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,-0.593307,0.741406,0.677484,1,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12422,-0.715787,-2.199905,-0.514546,0,True,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
12423,-0.225868,-1.643970,0.843044,1,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
12424,1.733806,0.741406,1.306611,1,True,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False
12425,-1.083226,-1.643970,0.578148,1,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [56]:
labels

0        0
1        0
2        0
3        0
4        0
        ..
12422    1
12423    1
12424    0
12425    0
12426    0
Name: willing_to_change_job, Length: 12427, dtype: int64

In [57]:
from sklearn.linear_model import LogisticRegression

X_train = training_data_processed
y_train = labels

final_model = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    class_weight='balanced',
    random_state=42
)

# Fit the model
final_model.fit(X_train, y_train)

In [58]:
testing_data_processed = transform_new_data(
    testing_data,
    metadata=metadata,
    columns_to_encode=columns_to_encode,
    scalers=scalers
)
testing_data_processed

Unnamed: 0,age,relative_wage,hours_of_training,is_certified,gender_Male,gender_Other,gender_Unknown,education_higher_BSc,education_higher_MSc,education_no_information,...,size_of_company_500-999,size_of_company_5000-9999,size_of_company_<10,size_of_company_unknown,type_of_company_no_information,type_of_company_other,type_of_company_private_company,type_of_company_public_sector,type_of_company_startup_with_funding,type_of_company_startup_wo_funding
0,-0.103389,0.894390,-0.713218,0,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1,0.019091,0.297932,1.439059,0,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
2,0.386530,-1.643970,-0.696662,0,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
3,-0.348348,0.741406,-0.729774,0,True,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,0.019091,0.789669,-0.829110,0,True,False,False,False,True,False,...,True,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3303,-1.083226,0.741406,-0.961557,0,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
3304,-0.103389,0.789669,-0.332430,0,True,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3305,0.264050,0.741406,-0.464878,0,False,False,True,True,False,False,...,False,False,False,False,False,False,True,False,False,False
3306,0.753969,0.741406,0.081469,0,False,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [59]:
predictions = final_model.predict(testing_data_processed)

# Create submission dataframe
submission_df = pd.DataFrame({
    'id': testing_data['id'],
    'willing_to_change_job': predictions
})

In [60]:
submission_df.to_csv('submission.csv', index=False)

In [61]:
submission_df

Unnamed: 0,id,willing_to_change_job
0,12428,0
1,12429,0
2,12430,1
3,12431,0
4,12432,0
...,...,...
3303,15731,0
3304,15732,0
3305,15733,0
3306,15734,1
