In [1]:
#Import drive
from google.colab import drive
#Mount Google Drive
ROOT="/content/drive"
drive.mount(ROOT, force_remount=True)

ModuleNotFoundError: No module named 'google'

In [None]:
!pip install dash pandas scikit-learn plotly

from dash import Dash, html, dcc, callback, Output, Input
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import requests
import io


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


In [None]:
import numpy as np

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

# Read Data

In [None]:
data_path = "https://raw.githubusercontent.com/lauramauricio/election-prediction-webapp/efe5785ebf31c3bf48d528e9aa2b6ebc7fa46d29/merged_dataset.csv"
df = pd.read_csv(data_path)

In [None]:
column_drop = ["med1"]
df_clean = df.drop(column_drop, axis=1)

In [None]:
df_clean.head()

# Descriptives

In [None]:
# Group by year and calculate the share of voter for each party
yearly_party_share = df.groupby('year')['vdn1b'].value_counts(normalize=True).rename('share').reset_index()

print(yearly_party_share)  # Display the first few rows of the result

In [None]:
# Filter for the "sps/pss" party
sps_pss_share = yearly_party_share[yearly_party_share['vdn1b'] == 'sps/pss'][['year', 'share']]

# Calculate 200 divided by 'total_gewahlt' for each year
df['total_gewahlt_proportion'] = df['total_gewahlt']/200

# Calculate the average proportion per year
total_gewahlt_proportion_by_year = df.groupby('year')['total_gewahlt_proportion'].mean().reset_index()

# Merge the party share with the total_gewahlt proportion by year
comparison_df = pd.merge(sps_pss_share, total_gewahlt_proportion_by_year, on='year')
comparison_df.rename(columns={'share': 'sps_pss_share', 'total_gewahlt_proportion': 'total_gewahlt_proportion'}, inplace=True)

print("Comparison DataFrame:")
print(comparison_df)

# Transform into suitable Data

In [None]:
# Replace "NaN" values with 0 for both numeric and categorical data
df_clean = df_clean.fillna(0)

In [None]:
df_clean.head()


In [None]:
# Adjust display options to show more columns
pd.set_option('display.max_columns', None)

# Print all column names
print("Column names:", df_clean.columns)

# Print the first few rows
print("First few rows:")
print(df_clean.head())

In [None]:
column_drop = ["party", "total_gewahlt", "total_men", "total_women", "lr1", "pid2b"]
df_clean = df_clean.drop(column_drop, axis=1)

In [None]:
# Identify categorical columns
categorical_cols = df_clean.select_dtypes(include=['object']).columns

# Exclude 'pid2b' from categorical columns
categorical_cols = [col for col in categorical_cols if col != 'vdn1b']

# Encode categorical columns using LabelEncoder
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    label_encoders[col] = le


In [None]:
df_clean = df_clean.drop('year', axis=1)
df_clean

In [None]:
df_clean["vdn1b"].unique()

array(['sps/pss', 'no party identification', 'lps/pls', 'cvp/pdc',
       'pda/pdt', 'rep. (& vigil.)', 'csp/pcs', 'fdp/prd', 0, 'ldu/adi',
       'svp/udc', 'other parties', 'sd/ds', 'evp/pep', 'poch', 'fps/psl',
       'left parties', 'gps/pes', 'fga/avf', 'right parties', 'edu/udf',
       'sol.', 'lega', 'other comments', 'psa (psu)', 'mcg', 'bdp',
       "GLP/Vert'libéraux", 'centre parties'], dtype=object)

# Class Classification

In [None]:
df_clean.head()

In [None]:
# Convert vdn1b to a categorical text variable
df_clean['vdn1b'] = df_clean['vdn1b'].astype('category')

# Verify the conversion
df_clean['vdn1b'].dtype  # Should show 'category'

In [None]:
import pandas as pd

# Dictionary to rename categories
category_rename_mapping = {
    0: 'unknown',  # Example for numeric 0
    'GLP/Vert\'libéraux': "GLP",
    'bdp': 'BDP',
    'centre parties': 'Centre Parties',
    'csp/pcs': 'CSP',
    'cvp/pdc': 'CVP',
    'edu/udf': 'EDU',
    'evp/pep': 'EVP',
    'fdp/prd': 'FDP',
    'fga/avf': 'FGA',
    'fps/psl': 'FPS',
    'gps/pes': 'GPS',
    'ldu/adi': 'LdU',
    'left parties': 'Left Parties',
    'lega': 'Lega',
    'lps/pls': 'LPS',
    'mcg': 'MCG',
    'other comments': 'Other Comments',
    'other parties': 'Other Parties',
    'pda/pdt': 'PdA',
    'poch': 'POCH',
    'psa (psu)': 'PSA',
    'rep. (& vigil.)': 'Rep',
    'right parties': 'Right Parties',
    'sd/ds': 'SD',
    'sol.': 'Sol',
    'sps/pss': 'SP',
    'svp/udc': 'SVP',
    'voted blank': 'Voted Blank'
}

# Ensure the dictionary keys are matching the current categories
print("Category rename mapping keys:", category_rename_mapping.keys())

# Rename categories
df_clean['vdn1b'] = df_clean['vdn1b'].cat.rename_categories(category_rename_mapping)

# Verify the renaming
print("Renamed categories:", df_clean['vdn1b'].cat.categories)
print(df_clean.head())


## All Classes

In [None]:
# Split the data into features and target
X = df_clean.drop('vdn1b', axis=1)
y = df_clean['vdn1b']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Step 2: Choose a model (Random Forest Classifier)
model = RandomForestClassifier()

# Step 3: Train the model
model.fit(X_train, y_train)

# Step 4: Evaluate the model
predictions = model.predict(X_test)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
# Step 5: Save the trained model to a pickle file
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(model, file)

## Most Common Parties

In [None]:
# Count occurrences of each party
party_counts = df_clean['vdn1b'].value_counts()

party_counts

In [None]:
subset = df_clean[df_clean['vdn1b'].isin(['FDP','CVP','SP','SVP',
                                                        'LPS','LdU','EVP','CSP',
                                                        "PdA",'PSA', "GLP"])]

subset.head()

In [None]:
# save subset data
subset.to_csv('subset.csv', index=False)

In [None]:
# Split the data into features and target
X = subset.drop('vdn1b', axis=1)
y = subset['vdn1b']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
print(subset.columns)

In [None]:
subset

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Step 2: Choose a model (Random Forest Classifier)
model = RandomForestClassifier()

# Step 3: Train the model
model.fit(X_train, y_train)

# Step 4: Evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# Deep Learning Model

In [None]:
def sigmoid(z):
  z_clipped = np.clip(z, -500, 500)
  s = 1 / (1 + np.exp(-z_clipped))
  return s

In [None]:
def initialize_with_zeros(X):
  num_samples, num_features = X.shape
  weights = np.zeros(num_features).reshape(-1, 1)
  bias = 0.0
  assert(weights.shape == (num_features, 1))
  assert(isinstance(bias, float) or isinstance(bias, int))
  return weights, bias

In [None]:
def compute_cost(Y, y_predicted):
  epsilon = 1e-15 # Small constant to avoid division by zero
  num_samples = len(Y)
  cost = (-1/num_samples) * np.sum(Y * np.log(y_predicted + epsilon) + (1 - Y) * np.log(1 - y_predicted + epsilon))
  return cost

In [None]:
def propagate(weights, bias, X, Y):
  assert X.shape[0] == Y.shape[0], "Number of samples in X and Y must match"
  num_samples, num_features = X.shape

  linear_model = np.dot(weights.T, X.T) + bias
  y_predicted = sigmoid(linear_model)
  # Compute cost
  cost = compute_cost(Y, y_predicted)
  # BACKWARD PROPAGATION
  dw = (1 / num_samples) * np.dot(X.T, ((y_predicted- Y.T).T))
  db = (1 / num_samples) * np.sum(y_predicted-Y.T)

  assert(dw.shape == weights.shape)
  assert(db.dtype == float)
  cost = np.squeeze(cost)
  assert(cost.shape == ())
  grads = {"dw": dw, "db": db}
  return grads, cost

In [None]:
# GRADED FUNCTION: optimize

def optimize(weights, bias, X, Y, num_iterations, learning_rate):
    """
    This function optimizes w and b by running a gradient descent algorithm

    Arguments:
    weights -- weights, a numpy array of size (1, num_features)
    bias -- bias, a scalar
    X -- data of shape (number of examples, num_features)
    Y -- true "label" vector (containing 0 if non-nith, 1 if nith), of shape (number of examples, 1)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule


    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.

    Tips:
    You basically need to write down two steps and iterate through them:
        1) Calculate the cost and the gradient for the current parameters. Use propagate().
        2) Update the parameters using gradient descent rule for w and b.
    """
    costs_history=[]
    # Gradient descent
    for i in range(num_iterations):


        # Cost and gradient calculation (≈ 1-4 lines of code)
        ### START CODE HERE ###
        grads, cost = propagate(weights,bias,X,Y)
        ### END CODE HERE ###

        # Retrieve derivatives from grads
        dw = np.array(grads["dw"], dtype=float)
        db =  np.array(grads["db"], dtype=float)

        # update rule (≈ 2 lines of code)
        ### START CODE HERE ###
        weights = weights.T - learning_rate * dw
        bias -= learning_rate * db
        ### END CODE HERE ###

        # Record the costs
        if i % 100 == 0:
            costs_history.append(np.mean(cost))

            # Print the cost every 100 training examples
            print ("Cost after iteration %i: %f" %(i, np.mean(cost)))

        weights=weights.T
        #print(weights.shape)

    params = {"w": weights,
                "b": bias}

    grads = {"dw": dw,
              "db": db}

    return params, grads, costs_history

In [None]:
# GRADED FUNCTION: predict

def predict(weights, bias, X):
    '''
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    weights -- weights, a numpy array of size (num_samples, 1)
    bias -- bias, a scalar
    X -- data of size (num_samples, num_features)

    Returns:
    Y_prediction -- a numpy array (vector) containing all predictions (0/1) for the examples in X
    '''
    # Assertion for dimensions
    #assert X.shape[1] == len(weights), "Number of features in X must match the size of weights vector"

    linear_model = np.dot(weights, X.T) + bias
    y_predicted = sigmoid(np.array(linear_model, dtype=float))
    y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted[0]]

    return y_predicted_cls

In [None]:
# GRADED FUNCTION: model

def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5):

    """
      Builds the logistic regression model by calling the function you've implemented previously

      Arguments:
      X_train -- training set represented by a numpy array of shape (num_samples_train, num_features)
      Y_train -- training labels represented by a numpy array (vector) of shape (num_samples_train, num_features)
      X_test -- test set represented by a numpy array of shape (num_samples_test, num_features)
      Y_test -- test labels represented by a numpy array (vector) of shape (num_samples_test, 1)
      num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
      learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()


      Returns:
      d -- dictionary containing information about the model.
    """

    ### START CODE HERE ###

    # initialize parameters with zeros (≈ 1 line of code)

    weights, bias = initialize_with_zeros(X_train)

    # Gradient descent (≈ 1 line of code)
    parameters, grads, costs_history =  optimize(weights, bias, X_train, Y_train, num_iterations, learning_rate)

    # Retrieve parameters w and b from dictionary "parameters"
    weights = parameters["w"]
    bias = parameters["b"]

    # Predict test/train set examples (≈ 2 lines of code)
    Y_prediction_test = predict(weights, bias, X_test)
    Y_prediction_train = predict(weights, bias, X_train)


    ### END CODE HERE ###

    # Print train/test Errors

    accuracy_test = np.mean(Y_prediction_test ==Y_test.reshape(1,-1))
    print("Test Accuracy:", accuracy_test)

    accuracy_train = np.mean(Y_prediction_train == Y_train.reshape(1,-1))
    print("Test Accuracy:", accuracy_train)

    # Plot cost over iterations
    import matplotlib.pyplot as plt
    plt.plot(range(len(costs_history)), costs_history)
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.title('Cost vs. Iterations')
    plt.show()

    d = {"costs": costs_history,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train" : Y_prediction_train,
        "w" : weights,
        "b" : bias,
        "learning_rate" : learning_rate,
        "num_iterations": num_iterations}

    return d

# Apply to Data

In [None]:
### START CODE HERE ###
subset['vdn1b'] = pd.Categorical(subset['vdn1b']).codes
df_dummies_model = subset.copy()
# Setting 'y' to 1 where it equals "SVP"
df_dummies_model.loc[df_dummies_model['vdn1b'] == "SVP", 'vdn1b'] = 1
df_dummies_model.loc[df_dummies_model['vdn1b'] != 1, 'vdn1b'] = 0

In [None]:
# train, test split
X_train,\
X_test, \
y_train,\
y_test = train_test_split(df_dummies_model.iloc[:,1:], df_dummies_model['vdn1b'], test_size=0.3)

# Display the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
dict_data=model(X_train.values, y_train.values, X_test.values, y_test.values, num_iterations = 1000, learning_rate = 0.1)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Extract predictions from the dictionary
y_pred = dict_data['predictions']

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load your data (replace with your own data loading code)
# data = pd.read_csv('your_dataset.csv')
# X = data.drop('target', axis=1)
# y = data['target']

# Example data (for demonstration purposes)
X, y = np.random.rand(43000, 10), np.random.randint(2, size=43000)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Random Forest metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_pred_rf)

# Train and evaluate Neural Network
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")

# Neural Network metrics
nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_precision = precision_score(y_test, y_pred_nn)
nn_recall = recall_score(y_test, y_pred_nn)
nn_f1 = f1_score(y_test, y_pred_nn)
nn_auc = roc_auc_score(y_test, y_pred_nn)

# Compare Metrics
metrics = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'],
    'Random Forest': [rf_accuracy, rf_precision, rf_recall, rf_f1, rf_auc],
    'Neural Network': [nn_accuracy, nn_precision, nn_recall, nn_f1, nn_auc]
})

print(metrics)