In [31]:
pip install dash pandas numpy scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [32]:
import os
import pickle
from pathlib import Path

import dash
import numpy as np
import pandas as pd
from dash import dcc, html, Input, Output, State

### Function: Random Sample Line

In [33]:
import pandas as pd
import numpy as np

def create_sample_line(adjustments=None):
    """Build a one-row DataFrame with random demographics, then apply overrides.

    The row holds a randomly drawn ``sex``, ``age`` (18-79) and ``religion``.

    NOTE: a later cell in this notebook defines another ``create_sample_line``;
    once that cell has run, it replaces this function.

    Parameters
    ----------
    adjustments : dict, optional
        Maps a column name to a target:

        * ``'age'`` -> number; the column is shifted by ``target - mean`` so
          the single generated row ends up at ``target``.
        * any other column -> ``str`` replaces the value outright; a number
          rescales a numeric column so that its mean equals the target.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``sex``, ``age`` and ``religion``.

    Raises
    ------
    ValueError
        If a variable is not a column of the frame, if the target type does
        not fit the column (e.g. a number for a categorical column, or a
        non-number for ``age``), or if a numeric column with zero mean is
        asked to be rescaled.
    """
    def _is_number(value):
        # NumPy scalars (np.int64, np.float32, ...) are valid numeric targets;
        # bool is a subclass of int but is never a meaningful target here.
        return (isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, bool))

    # Draw order (sex, age, religion) is kept so seeded runs stay comparable.
    data = {
        'sex': np.random.choice(['male', 'female']),
        'age': np.random.randint(18, 80),
        'religion': np.random.choice(['cath', 'none', 'other', 'prot']),
    }

    # Convert the single data line into a DataFrame
    df = pd.DataFrame(data, index=[0])

    if not adjustments:
        return df

    for variable, target in adjustments.items():
        if variable not in df.columns:
            raise ValueError(f"Invalid variable name: {variable}")

        if variable == 'age':
            if not _is_number(target):
                raise ValueError(f"Invalid target type for {variable}.")
            # Shift so that the column mean matches the target
            df['age'] = df['age'] + (target - df['age'].mean())
        elif isinstance(target, str):
            # Categorical override
            df[variable] = target
        elif _is_number(target):
            # A numeric target only makes sense for a numeric column; taking
            # the mean of a string column would fail with an opaque TypeError.
            if not pd.api.types.is_numeric_dtype(df[variable]):
                raise ValueError(f"Invalid target type for {variable}.")
            current_mean = df[variable].mean()
            if current_mean == 0:
                raise ValueError(f"Cannot rescale {variable}: current mean is zero.")
            df[variable] = df[variable] * (target / current_mean)
        else:
            raise ValueError(f"Invalid target type for {variable}.")

    return df



# Full Model

## Prep Steps

### Data

In [34]:
# Locate subset.csv without depending on one machine's home directory:
#   1. the SUBSET_CSV environment variable, if set;
#   2. otherwise ./subset.csv relative to the working directory;
#   3. otherwise the original absolute location, kept as a fallback.
_LEGACY_SUBSET_CSV = Path("/Users/tarapfrunder/election-prediction-webapp/election-prediction-webapp/subset.csv")
SUBSET_CSV = Path(os.environ.get("SUBSET_CSV", "subset.csv"))
if not SUBSET_CSV.exists() and _LEGACY_SUBSET_CSV.exists():
    SUBSET_CSV = _LEGACY_SUBSET_CSV

subset = pd.read_csv(SUBSET_CSV)

In [35]:
# Preview the first rows of the freshly loaded data
subset.head()

Unnamed: 0,year,sex,age,educ,income,religion,sg1,sg9,sc1,sc7b,pi1,pm3,vp1,vdn1b,pid1
0,1971,female,39.0,compulsory education,0,prot,French spoken,0,without profession,service class employees,rather not interested,mixed mat,yes,FDP,no
1,1971,female,48.0,primary school,0,prot,French spoken,0,in household,others,rather not interested,mixed mat,yes,PdA,yes
2,1971,female,43.0,compulsory education,0,other,French spoken,0,full-time,others,not interested at all,materialist,yes,SP,no
3,1971,female,33.0,primary school,0,cath,French spoken,0,full-time,semiskilled and unskilled workers,not interested at all,mixed mat,yes,SP,yes
4,1971,male,78.0,primary school,0,prot,French spoken,0,retired,skilled workers/foremen,not interested at all,mixed mat,yes,SP,no


In [36]:
# Columns that must not reach the model:
#   - "year": survey year, dropped as before.
#   - "Unnamed: 0": counts 0, 1, 2, ... in the preview above, so it looks like
#     a row index written out with the CSV (TODO confirm); as a feature it
#     would only let the model memorise row positions.
# errors="ignore" keeps the cell re-runnable after the columns are gone.
column_drop = ["year", "Unnamed: 0"]
subset = subset.drop(columns=column_drop, errors="ignore")

In [37]:
# Preview the frame again after dropping columns
subset.head()

Unnamed: 0,sex,age,educ,income,religion,sg1,sg9,sc1,sc7b,pi1,pm3,vp1,vdn1b,pid1
0,female,39.0,compulsory education,0,prot,French spoken,0,without profession,service class employees,rather not interested,mixed mat,yes,FDP,no
1,female,48.0,primary school,0,prot,French spoken,0,in household,others,rather not interested,mixed mat,yes,PdA,yes
2,female,43.0,compulsory education,0,other,French spoken,0,full-time,others,not interested at all,materialist,yes,SP,no
3,female,33.0,primary school,0,cath,French spoken,0,full-time,semiskilled and unskilled workers,not interested at all,mixed mat,yes,SP,yes
4,male,78.0,primary school,0,prot,French spoken,0,retired,skilled workers/foremen,not interested at all,mixed mat,yes,SP,no


### Model

In [38]:
from dash import Dash, dcc, html, callback, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import dash_ag_grid as dag
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import joblib

# Identify the categorical (object-dtype) columns. The prediction target
# 'vdn1b' is excluded so it keeps its party labels.
categorical_cols = [
    col for col in subset.select_dtypes(include=['object']).columns
    if col != 'vdn1b'
]

# This cell encodes `subset` in place. On a second run there are no object
# columns left, and the original code would then overwrite
# label_encoders.pkl with an empty dict. Fail loudly instead.
if not categorical_cols:
    raise RuntimeError(
        "No object-dtype feature columns found: `subset` appears to be "
        "label-encoded already. Re-run the data loading cells before this one."
    )

# Fit one LabelEncoder per categorical column and keep it for the app
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    subset[col] = le.fit_transform(subset[col])
    label_encoders[col] = le

# Print the label -> code mapping of every encoder for future reference
print("Label Encoders:")
for col, le in label_encoders.items():
    print(col, ": ", dict(zip(le.classes_, le.transform(le.classes_))))

# `subset` now contains the encoded categorical columns
print("\nEncoded DataFrame:")
print(subset.head())

# Save label encoders to file
joblib.dump(label_encoders, 'label_encoders.pkl')


Label Encoders:
sex :  {'female': np.int64(0), 'male': np.int64(1)}
educ :  {'0': np.int64(0), 'basic vocational training': np.int64(1), 'compulsory education': np.int64(2), 'diploma school': np.int64(3), 'high school': np.int64(4), 'higher vocational training': np.int64(5), 'primary school': np.int64(6), 'university': np.int64(7), 'vocational college': np.int64(8), 'vocational education': np.int64(9)}
income :  {' rather high income': np.int64(0), ' rather low income': np.int64(1), '0': np.int64(2), 'high income': np.int64(3), 'low income': np.int64(4), 'medium income': np.int64(5)}
religion :  {'0': np.int64(0), 'cath': np.int64(1), 'none': np.int64(2), 'other': np.int64(3), 'prot': np.int64(4)}
sg1 :  {'0': np.int64(0), 'French spoken': np.int64(1), 'German spoken': np.int64(2), 'Italian spoken': np.int64(3)}
sg9 :  {'0': np.int64(0), 'rural': np.int64(1), 'urban': np.int64(2)}
sc1 :  {'0': np.int64(0), 'disabled': np.int64(1), 'full-time': np.int64(2), 'in family business': np.int6

['label_encoders.pkl']

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Step 1: Split the data into features and target, then into train/test sets
X = subset.drop('vdn1b', axis=1)
y = subset['vdn1b']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Choose a model (Random Forest Classifier).
# The seed is fixed so the report below and the saved model are reproducible;
# the split was already seeded, the forest was not.
model = RandomForestClassifier(random_state=42)

# Step 3: Train the model
model.fit(X_train, y_train)

# Step 4: Evaluate the model on the held-out 20 %
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         CSP       0.06      0.03      0.04        29
         CVP       0.33      0.34      0.33       734
         EVP       0.10      0.04      0.06        78
         FDP       0.30      0.29      0.30       973
         GLP       0.25      0.13      0.17       130
         LPS       0.18      0.07      0.10        86
         LdU       0.29      0.11      0.16        66
         PdA       0.25      0.02      0.04        48
          SP       0.42      0.48      0.45      1341
         SVP       0.39      0.41      0.40       980

    accuracy                           0.36      4465
   macro avg       0.26      0.19      0.20      4465
weighted avg       0.35      0.36      0.35      4465



In [40]:
# Persist the fitted classifier to disk so the app cells can reload it
import joblib
joblib.dump(value=model, filename='random_forest_model.pkl')

['random_forest_model.pkl']

## App

In [41]:
from dash import Dash, dcc, html, Input, Output
import dash_bootstrap_components as dbc
import plotly.graph_objs as go
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib  # For loading the pre-trained model

In [42]:
# Restore the artefacts written by the training cells.
# NOTE: joblib files are pickle-based, so only load files you created yourself.

# Fitted random-forest classifier
model_path = 'random_forest_model.pkl'
rfc = joblib.load(filename=model_path)

# Per-column LabelEncoder objects, keyed by column name
label_encoders = joblib.load(filename='label_encoders.pkl')

In [43]:
# Pre-Define Categorical Options
# Each 'value' must equal a class seen by the fitted LabelEncoder, otherwise
# `le.transform` raises on it. For income the encoder classes are
# ' rather high income' and ' rather low income' (with a leading space in the
# raw data) and 'medium income'. Labels are the clean display text.
income_options = [
    {'label': 'rather high income', 'value': ' rather high income'},
    {'label': 'rather low income', 'value': ' rather low income'},
    {'label': 'high income', 'value': 'high income'},
    {'label': 'low income', 'value': 'low income'},
    {'label': 'medium income', 'value': 'medium income'}
]

def _as_options(*values):
    """Turn each value into a Dash option dict whose label equals its value."""
    return [{'label': item, 'value': item} for item in values]


# Religious affiliation
religion_options = _as_options('cath', 'none', 'other', 'prot')

# Language region
sg1_options = _as_options('French spoken', 'German spoken', 'Italian spoken')

# Rural / urban
sg9_options = _as_options('rural', 'urban')

# Employment status
sc1_options = _as_options(
    'disabled',
    'full-time',
    'in family business',
    'in household',
    'in training/formation',
    'other',
    'part-time',
    'retired',
    'unemployed',
    'without profession',
)

# Occupational class
sc7b_options = _as_options(
    'farmers',
    'other self-employed',
    'others',
    'routine non-manual workers',
    'semiskilled and unskilled workers',
    'service class employees',
    'skilled workers/foremen',
)

# Political interest
pi1_options = _as_options(
    'not interested at all',
    'rather interested',
    'rather not interested',
    'very interested',
)

# (Post-)materialism index
pm3_options = _as_options('materialist', 'mixed mat', 'mixed postmat', 'post-materialist')

# Participation in federal elections
vp1_options = _as_options('yes', 'no')

# Party attachment
pid1_options = _as_options('yes', 'no')



In [46]:
# Create the Dash application with the Bootstrap stylesheet
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])


def _dropdown_column(caption, component_id, choices):
    """Return a width-3 column holding a caption and a dropdown.

    Every dropdown starts with the value '0', which is not among the option
    values defined for these dropdowns.
    """
    return dbc.Col(
        [
            html.Div(caption),
            dcc.Dropdown(id=component_id, options=choices, value='0'),
        ],
        width=3,
    )


# (caption, component id, options) for each dropdown, in display order
_DROPDOWN_SPECS = [
    ("Select Sex:", 'sex-dropdown',
     [{'label': 'Female', 'value': 'female'}, {'label': 'Male', 'value': 'male'}]),
    ("Select Religion:", 'religion-dropdown', religion_options),
    ("Select Income:", 'income-dropdown', income_options),
    ("Select Language Region:", 'sg1-dropdown', sg1_options),
    ("Select Rural/Urban:", 'sg9-dropdown', sg9_options),
    ("Select Profession:", 'sc1-dropdown', sc1_options),
    ("Select Sector:", 'sc7b-dropdown', sc7b_options),
    ("Select Political Interest:", 'pi1-dropdown', pi1_options),
    ("Select PostMaterialism:", 'pm3-dropdown', pm3_options),
    ("Participated in Federal Elections?:", 'vp1-dropdown', vp1_options),
    ("Attachment to Party?:", 'pid1-dropdown', pid1_options),
]

# Numeric age field (18-90, whole numbers, value sent after the user commits it)
_age_column = dbc.Col(
    [
        html.Div("Enter Mean Age:"),
        dcc.Input(value="", type='number', debounce=True, id='mean-age', min=18, max=90, step=1),
    ],
    width=3,
)

# Button that triggers the prediction callback
_button_column = dbc.Col(
    [
        dbc.Button("Generate Predictions", id='button', color="primary", className="mr-1"),
    ],
    width=3,
)

# Page layout: title, one row of input controls, one row with the result graph
app.layout = dbc.Container(
    [
        html.H1('Party Support Prediction Based on Demographics', style={'textAlign': 'center'}),
        dbc.Row(
            [
                _age_column,
                *[_dropdown_column(*spec) for spec in _DROPDOWN_SPECS],
                _button_column,
            ],
            className='mb-3',
        ),
        dbc.Row([
            dbc.Col([dcc.Graph(id='predicted-class-histogram')], width=12),
        ]),
    ]
)

# Function to create a sample DataFrame with adjustments
def create_sample_line(adjustments=None):
    """Build the one-row feature frame for a prediction request.

    Parameters
    ----------
    adjustments : dict, optional
        Maps a column name to the user's choice. A missing key and a key
        whose value is ``None`` (e.g. a cleared dropdown) both fall back to
        the column default; ``age`` falls back to a random integer in
        [18, 80).

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``age, sex, religion, income, sg1, sg9,
        sc1, sc7b, pi1, pm3, vp1, pid1`` in that order.
    """
    # Defaults for the categorical columns. 'medium income' (not
    # 'middle income') is the class the fitted income encoder knows.
    defaults = {
        'sex': 'male',
        'religion': 'none',
        'income': 'medium income',
        'sg1': 'German spoken',
        'sg9': 'urban',
        'sc1': 'full-time',
        'sc7b': 'service class employees',
        'pi1': 'rather interested',
        'pm3': 'mixed postmat',
        'vp1': 'no',
        'pid1': 'no',
    }
    supplied = adjustments or {}

    # dict.get(key, default) returns a stored None unchanged, so unset values
    # must be detected explicitly.
    age = supplied.get('age')
    if age is None:
        age = np.random.randint(18, 80)

    data = {'age': [age]}
    for column, fallback in defaults.items():
        chosen = supplied.get(column)
        data[column] = [fallback if chosen is None else chosen]
    return pd.DataFrame(data)

# Callback to update the predicted class histogram based on user inputs.
# Only the button is an Input; the form fields are read as State, so a
# prediction runs when the user clicks "Generate Predictions" rather than on
# every field change.
@app.callback(
    Output('predicted-class-histogram', 'figure'),
    [Input('button', 'n_clicks')],
    [State('mean-age', 'value'),
     State('sex-dropdown', 'value'),
     State('religion-dropdown', 'value'),
     State('income-dropdown', 'value'),
     State('sg1-dropdown', 'value'),
     State('sg9-dropdown', 'value'),
     State('sc1-dropdown', 'value'),
     State('sc7b-dropdown', 'value'),
     State('pi1-dropdown', 'value'),
     State('pm3-dropdown', 'value'),
     State('vp1-dropdown', 'value'),
     State('pid1-dropdown', 'value')]
)
def update_predicted_class_histogram(n_clicks, mean_age, sex_value, religion_value, income_value, sg1_value,
                                     sg9_value, sc1_value, sc7b_value, pi1_value, pm3_value,
                                     vp1_value, pid1_value):
    """Return a bar chart of the model's class probabilities for the form values.

    Returns an empty figure until the button has been clicked. If a field
    holds a label its encoder has never seen, the figure's title names the
    offending field instead of the callback raising.
    """
    if not n_clicks:
        return go.Figure()

    form_values = {
        'age': mean_age,
        'sex': sex_value,
        'religion': religion_value,
        'income': income_value,
        'sg1': sg1_value,
        'sg9': sg9_value,
        'sc1': sc1_value,
        'sc7b': sc7b_value,
        'pi1': pi1_value,
        'pm3': pm3_value,
        'vp1': vp1_value,
        'pid1': pid1_value
    }
    # Leave unset fields out so create_sample_line applies its defaults.
    # Passing None through would turn 'age' into an object column and send it
    # to the label-encoding loop, which has no encoder for it.
    adjustments = {name: value for name, value in form_values.items()
                   if value is not None and value != ''}
    if 'age' in adjustments:
        adjustments['age'] = int(adjustments['age'])

    # Generate sample DataFrame
    input_data = create_sample_line(adjustments)

    # Encode every column that has a fitted encoder. Driving this from the
    # encoder dict rather than the frame's dtypes keeps numeric columns out.
    for col, le in label_encoders.items():
        if col not in input_data.columns:
            continue
        raw_label = input_data[col].iloc[0]
        # Match on whitespace-stripped text: some training labels carry a
        # leading space (e.g. ' rather high income').
        known_labels = {str(cls).strip(): cls for cls in le.classes_}
        lookup_key = str(raw_label).strip()
        if lookup_key not in known_labels:
            fig = go.Figure()
            fig.update_layout(
                title=f"Cannot predict: '{raw_label}' is not a known value for '{col}'. "
                      f"Please pick one of the listed options."
            )
            return fig
        input_data[col] = le.transform([known_labels[lookup_key]])

    # Ensure the input data matches the expected feature order.
    # Features the form does not provide are filled with 0.
    expected_features = rfc.feature_names_in_
    input_data = input_data.reindex(columns=expected_features, fill_value=0)

    # Predict with the pre-trained model
    predicted_probabilities = rfc.predict_proba(input_data)[0]

    # Create a bar chart of predicted probabilities for all classes
    fig = go.Figure(data=[go.Bar(x=rfc.classes_, y=predicted_probabilities, marker_color='blue')])
    fig.update_layout(title="Predicted Class Probabilities", xaxis_title="Class", yaxis_title="Probability")
    return fig




In [47]:
# Run the app.
# `app.run` replaces the obsolete `app.run_server` in current Dash releases;
# fall back to `run_server` only when `run` is not available.
if __name__ == '__main__':
    _start_server = getattr(app, 'run', None) or app.run_server
    _start_server(debug=True)