# Data 

## Necessary packages and libraries

In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

## Load Selects cumulative data

In [31]:
# Define the file path
file_path = '../data/selects_cumulative_1971-2019.dta'

# Load the Stata file
data = pd.read_stata(file_path)

In [32]:
# Save the dataframe to a CSV file
csv_file_path = "../data/selects_cumulative_1971-2019.csv"
data.to_csv(csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")

Data saved to ../data/selects_cumulative_1971-2019.csv


In [33]:
data.head()

Unnamed: 0,year,userid,useridpy,sex,age,maritals,educ,income_sfr,income,income_hh,...,intinf2,intinf3,intinf4,intinf5,intinf6,intinf7,weightc,weightst,weightp,weighttot
0,1971,1,3.0,male,33.0,married,primary school,,,,...,2.0,1972.0,70.0,wife/husband,never,0.666667,1.0,0.936,1.0,0.936
1,1971,2,5.0,male,56.0,married,compulsory education,,,,...,6.0,1972.0,50.0,no one,,0.666667,1.0,0.936,1.0,0.936
2,1971,3,6.0,male,49.0,married,primary school,,,,...,2.0,1972.0,60.0,no one,,not interested,1.0,1.0,1.0,1.0
3,1971,4,9.0,female,60.0,married,primary school,,,,...,6.0,1972.0,55.0,no one,,0.666667,1.0,1.099,1.0,1.099
4,1971,5,10.0,female,39.0,married,compulsory education,,,,...,2.0,1972.0,65.0,wife/husband,0.666667,0.333333,1.0,0.936,1.3648,1.277453


## Selecting chosen variables

In [34]:
# Loading original data before subsetting
subset_source = pd.read_csv("../data/selects_cumulative_1971-2019.csv")

  subset_source = pd.read_csv("../data/selects_cumulative_1971-2019.csv")


In [35]:
# Create a subset with selected columns
selected_columns = ['year', 'userid', 'sex', 'age', 'educ', 'income',
                    'religion','sg1', 'sg9', 'sc1', 'sc7a', 'sc7b',
                    'pi1', 'lr1', 'pm3',
                    'vp1', 'vdn1b', 'pid1', 'pid2b', 'trust1',
                    'weighttot']  
subset = subset_source.loc[:, selected_columns]
subset.head()

Unnamed: 0,year,userid,sex,age,educ,income,religion,sg1,sg9,sc1,...,sc7b,pi1,lr1,pm3,vp1,vdn1b,pid1,pid2b,trust1,weighttot
0,1971,1,male,33.0,primary school,,cath,French spoken,,in training/formation,...,service class employees,rather not interested,,materialist,yes,,yes,sps/pss,,0.936
1,1971,2,male,56.0,compulsory education,,other,French spoken,,full-time,...,other self-employed,rather not interested,,mixed postmat,yes,,yes,sps/pss,6.666666507720947,0.936
2,1971,3,male,49.0,primary school,,cath,French spoken,,full-time,...,semiskilled and unskilled workers,rather not interested,,mixed mat,,,no,no party identification,6.666666507720947,1.0
3,1971,4,female,60.0,primary school,,prot,French spoken,,in household,...,,not interested at all,,mixed mat,no,,no,no party identification,,1.099
4,1971,5,female,39.0,compulsory education,,prot,French spoken,,without profession,...,service class employees,rather not interested,,mixed mat,yes,fdp/prd,no,no party identification,3.3333332538604736,1.277453


## Preprocessing data

In [36]:
# Replace "NaN" with "0" for both numeric and categorical data
subset_clean = subset.fillna(0)

# Further select most important variables
column_drop = ["year", "userid", "lr1", "pid2b", "sc7a", "weighttot", "trust1"]
subset_clean = subset_clean.drop(column_drop, axis=1)

subset_clean.head()

Unnamed: 0,sex,age,educ,income,religion,sg1,sg9,sc1,sc7b,pi1,pm3,vp1,vdn1b,pid1
0,male,33.0,primary school,0,cath,French spoken,0,in training/formation,service class employees,rather not interested,materialist,yes,0,yes
1,male,56.0,compulsory education,0,other,French spoken,0,full-time,other self-employed,rather not interested,mixed postmat,yes,0,yes
2,male,49.0,primary school,0,cath,French spoken,0,full-time,semiskilled and unskilled workers,rather not interested,mixed mat,0,0,no
3,female,60.0,primary school,0,prot,French spoken,0,in household,0,not interested at all,mixed mat,no,0,no
4,female,39.0,compulsory education,0,prot,French spoken,0,without profession,service class employees,rather not interested,mixed mat,yes,fdp/prd,no


In [37]:
# Dictionary to rename categories
category_rename_mapping = {
    0: 'unknown',  # Example for numeric 0
    'GLP/Vert\'libéraux': "GLP",
    'bdp': 'BDP',
    'centre parties': 'Centre Parties',
    'csp/pcs': 'CSP',
    'cvp/pdc': 'CVP',
    'edu/udf': 'EDU',
    'evp/pep': 'EVP',
    'fdp/prd': 'FDP',
    'fga/avf': 'FGA',
    'fps/psl': 'FPS',
    'gps/pes': 'GPS',
    'ldu/adi': 'LdU',
    'left parties': 'Left Parties',
    'lega': 'Lega',
    'lps/pls': 'LPS',
    'mcg': 'MCG',
    'other comments': 'Other Comments',
    'other parties': 'Other Parties',
    'pda/pdt': 'PdA',
    'poch': 'POCH',
    'psa (psu)': 'PSA',
    'rep. (& vigil.)': 'Rep',
    'right parties': 'Right Parties',
    'sd/ds': 'SD',
    'sol.': 'Sol',
    'sps/pss': 'SP',
    'svp/udc': 'SVP',
    'voted blank': 'Voted Blank'
}

In [38]:
# Convert vdn1b to a categorical text variable
subset_clean['vdn1b'] = subset_clean['vdn1b'].astype('category')

# Rename categories
subset_clean['vdn1b'] = subset_clean['vdn1b'].cat.rename_categories(category_rename_mapping)

# Verify the renaming
print("Renamed categories:", subset_clean['vdn1b'].cat.categories)

# Change LPS to FDP
subset_clean['vdn1b'] = subset_clean['vdn1b'].replace({'LPS': 'FDP'})

Renamed categories: Index(['unknown', 'GLP', 'BDP', 'Centre Parties', 'CSP', 'CVP', 'EDU', 'EVP',
       'FDP', 'FGA', 'FPS', 'GPS', 'LdU', 'Left Parties', 'Lega', 'LPS', 'MCG',
       'Other Comments', 'Other Parties', 'PdA', 'POCH', 'PSA', 'Rep',
       'Right Parties', 'SD', 'Sol', 'SP', 'SVP', 'Voted Blank'],
      dtype='object')


  subset_clean['vdn1b'] = subset_clean['vdn1b'].replace({'LPS': 'FDP'})


In [39]:
subset_clean['vdn1b'].unique()

['unknown', 'FDP', 'PdA', 'SP', 'Other Parties', ..., 'PSA', 'Centre Parties', 'BDP', 'GLP', 'MCG']
Length: 28
Categories (28, object): ['unknown', 'GLP', 'BDP', 'Centre Parties', ..., 'Sol', 'SP', 'SVP', 'Voted Blank']

In [40]:
# Select most important existing parties
subset_selected_parties = subset_clean[subset_clean['vdn1b'].isin(['FDP','CVP','SP','SVP','EVP',
                                                        'GPS', 'GLP'])]

subset_selected_parties

Unnamed: 0,sex,age,educ,income,religion,sg1,sg9,sc1,sc7b,pi1,pm3,vp1,vdn1b,pid1
4,female,39.0,compulsory education,0,prot,French spoken,0,without profession,service class employees,rather not interested,mixed mat,yes,FDP,no
9,female,43.0,compulsory education,0,other,French spoken,0,full-time,others,not interested at all,materialist,yes,SP,no
12,female,33.0,primary school,0,cath,French spoken,0,full-time,semiskilled and unskilled workers,not interested at all,mixed mat,yes,SP,yes
23,male,78.0,primary school,0,prot,French spoken,0,retired,skilled workers/foremen,not interested at all,mixed mat,yes,SP,no
27,male,76.0,primary school,0,cath,French spoken,0,retired,skilled workers/foremen,not interested at all,materialist,yes,SP,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43660,male,28.0,compulsory education,low income,none,French spoken,urban,other,0,very interested,0,yes,GLP,no
43661,male,43.0,vocational college,rather high income,none,German spoken,rural,part-time,0,rather interested,0,yes,FDP,no
43663,female,52.0,vocational education,rather low income,none,German spoken,rural,part-time,0,rather interested,0,yes,SP,no
43665,female,42.0,university,rather high income,prot,German spoken,urban,part-time,0,rather interested,0,yes,GLP,yes


In [41]:
# Identify categorical columns
categorical_cols = subset_selected_parties.select_dtypes(include=['object']).columns
categorical_cols = [col for col in categorical_cols if col != 'vdn1b']

# Encode categorical columns using LabelEncoder
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    subset_selected_parties[col] = le.fit_transform(subset_selected_parties[col].astype(str))
    label_encoders[col] = le
    
    # Print encoding mapping for each column
    print(f"Encoding mapping for {col}:")
    print(dict(zip(le.classes_, le.transform(le.classes_))))
    print()  # Add a blank line for readability

# Save the encoders for later use
joblib.dump(label_encoders, '../data/models/label_encoders.pkl')

# Verify the conversion of vdn1b
print("vdn1b dtype:", subset_selected_parties['vdn1b'].dtype)  # Should show 'category'

Encoding mapping for sex:
{'female': np.int64(0), 'male': np.int64(1)}

Encoding mapping for educ:
{'0': np.int64(0), 'basic vocational training': np.int64(1), 'compulsory education': np.int64(2), 'diploma school': np.int64(3), 'high school': np.int64(4), 'higher vocational training': np.int64(5), 'primary school': np.int64(6), 'university': np.int64(7), 'vocational college': np.int64(8), 'vocational education': np.int64(9)}

Encoding mapping for income:
{' rather high income': np.int64(0), ' rather low income': np.int64(1), '0': np.int64(2), 'high income': np.int64(3), 'low income': np.int64(4), 'medium income': np.int64(5)}

Encoding mapping for religion:
{'0': np.int64(0), 'cath': np.int64(1), 'none': np.int64(2), 'other': np.int64(3), 'prot': np.int64(4)}

Encoding mapping for sg1:
{'0': np.int64(0), 'French spoken': np.int64(1), 'German spoken': np.int64(2), 'Italian spoken': np.int64(3)}

Encoding mapping for sg9:
{'0': np.int64(0), 'rural': np.int64(1), 'urban': np.int64(2)}

En

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_selected_parties[col] = le.fit_transform(subset_selected_parties[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_selected_parties[col] = le.fit_transform(subset_selected_parties[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_selected_parties[c

## Save subset with 8 most important parties

In [42]:
# save subset data
subset_selected_parties.to_csv('../data/subset_selected_parties_model.csv', index=False)