Library:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
import os

from google.colab import drive

In [2]:
# Function to apply majority vote
def majority_vote(row):
  """Return the most frequent prediction in one row of a combined frame.

  Args:
    row: pandas Series whose first entry is the row's ID and whose remaining
      entries are the individual model predictions.

  Returns:
    The most common prediction as a scalar. On a tie, scipy.stats.mode
    reports the smallest of the tied values.
  """
  # Drop the leading ID entry by position so it never counts as a vote
  predictions = row.iloc[1:].values
  # Older SciPy releases return a length-1 array from mode(), newer ones a
  # scalar; flattening first yields a plain scalar under both.
  return np.ravel(mode(predictions)[0])[0]

# Name of the file this cell writes. It is excluded from the inputs below so
# that re-running the cell does not count a previous ensemble as one more voter.
output_file = 'majority_vote_predictions_combined_mix09.csv'

# Get all CSV file names in the current directory (sorted so the Pred<i>
# numbering is reproducible; os.listdir() returns names in arbitrary order)
csv_files = sorted(
  file for file in os.listdir()
  if file.endswith('.csv') and file != output_file
)

# Read each CSV file and keep the ones that have both submission columns
dfs = []
valid_files = []
for file in csv_files:
  df = pd.read_csv(file)
  if 'Predicted' in df.columns and 'ID' in df.columns:
    dfs.append(df)
    valid_files.append(file)

# Ensure there is at least one valid DataFrame
if not dfs:
  raise ValueError("No valid CSV files found.")

# Combine the predictions into a single DataFrame. Every file is looked up by
# the IDs of the first file, so files with a different row order still line up.
combined_df = pd.DataFrame({'ID': dfs[0]['ID']})
for i, (file, df) in enumerate(zip(valid_files, dfs), start=1):
  aligned = df.set_index('ID')['Predicted'].reindex(combined_df['ID'])
  if aligned.isna().any():
    raise ValueError(f"{file} is missing predictions for some IDs of {valid_files[0]}.")
  combined_df[f'Pred{i}'] = aligned.to_numpy()

# Apply majority vote for each row
combined_df['Predicted'] = combined_df.apply(majority_vote, axis=1)

# Create final DataFrame with 'ID' and majority voted predictions
final_df = combined_df[['ID', 'Predicted']]

# Save the final DataFrame to a new CSV file
final_df.to_csv(output_file, index=False)


In [None]:
# Previously saved ensemble submission; the comparison cells below merge each
# new ensemble against it on 'ID'.
df_old = pd.read_csv('./pre_vote/majority_vote_predictions6_2.csv')

In [None]:
# Left-join the reference submission (df_old) with the new ensemble on 'ID'
comparison_df = df_old.merge(final_df, on='ID', how='left')

# Both inputs have a 'Predicted' column, so the merge suffixes them:
# Predicted_x comes from df_old, Predicted_y from final_df
comparison_df['Is_Match'] = comparison_df['Predicted_x'].eq(comparison_df['Predicted_y'])

# Preview the first rows of the comparison
print(comparison_df.head())

# Rows on which the two submissions disagree
mismatch_count = int((~comparison_df['Is_Match']).sum())

# Size of the merged frame
total_rows = len(comparison_df)

# Share of rows that disagree
mismatch_proportion = mismatch_count / total_rows

print(f"Number of mismatched rows: {mismatch_count}")
print(f"Total number of rows: {total_rows}")
print(f"Proportion of mismatched rows: {mismatch_proportion:.2f}")

   ID  Predicted_x  Predicted_y  Is_Match
0   0            5            5      True
1   1            6            7     False
2   2            3            3      True
3   3            6            6      True
4   4            4            4      True
Number of mismatched rows: 12492
Total number of rows: 100000
Proportion of mismatched rows: 0.12


In [None]:
# Load the CSV files
df1 = pd.read_csv('FFNN_200-0.5_100-0.5_50-0.5_SGD90_e70_onlyNoisy01_ordinal.csv')
df2 = pd.read_csv('FFNN_200-0.5_100-0.5_50-0.5_e28_noisy01_ordinal.csv')
df3 = pd.read_csv('FFNN_200-0.5_100-0.5_50-0.5_e35_onlyNoisy001_ordinal.csv')
df4 = pd.read_csv('FFNN_200-0.5_100-0.5_50-0.5_e48_onlyNoisy01_ordinal.csv')
df5 = pd.read_csv('FFNN_200-0.5_100-0.5_50-0.5_e55_onlyNoisy01_ordinal.csv')
df6 = pd.read_csv('FFNN_200-0.5_100-0.5_e30_onlyNoisy01_ordinal.csv')
df7 = pd.read_csv('FFNN_256-0.5_128-0.5_64-0.5_e36_onlyNoisy01_ordinal.csv')
df8 = pd.read_csv('FFNN_256-0.5_128-0.5_64-0.5_ordinal_noisy01.csv')
df9 = pd.read_csv('FFNN_256-0.5_128-0.5_64-0.5_ordinal_only_noisy01.csv')

df10 = pd.read_csv('FFNN_128-0.5_64-0.3_32-0.3.csv')
df11 = pd.read_csv('FFNN_128-0.5_64-0.3_32-0.3_ordinal.csv')
df12 = pd.read_csv('FFNN_128-0.5_64-0.3_32-0.3_ordinal_noisy001.csv')
df13 = pd.read_csv('FFNN_128-0.5_64-0.3_32-0.3_ordinal_noisy01.csv')
df14 = pd.read_csv('FFNN_128-0.5_64-0.3_32-0.3_ordinal_only_noisy01.csv')
df15 = pd.read_csv('FFNN_256-0.5_128-0.5_64-0.5_e22_noisy01_ordinal.csv')
df16 = pd.read_csv('submission_FFNN_load1.csv')

In [None]:
# Submissions taking part in this ensemble, in Pred1..Pred9 order
submissions_c9 = [df1, df2, df3, df4, df5, df6, df7, df8, df9]

# Combine the predictions into a single DataFrame, using the 'ID' column to align them:
# every submission is looked up by df1's IDs, so a file with a different row
# order still lines up instead of being matched by row position.
combined_df_c9 = pd.DataFrame({'ID': df1['ID']})
for i, submission in enumerate(submissions_c9, start=1):
  aligned = submission.set_index('ID')['Predicted'].reindex(combined_df_c9['ID'])
  if aligned.isna().any():
    raise ValueError(f"Submission {i} is missing predictions for some IDs of df1.")
  combined_df_c9[f'Pred{i}'] = aligned.to_numpy()

# majority_vote is defined once, in the first voting cell of this notebook.

# Apply majority vote for each row
combined_df_c9['Predicted'] = combined_df_c9.apply(majority_vote, axis=1)

# Create final DataFrame with 'ID' and majority voted predictions
final_df_c9 = combined_df_c9[['ID', 'Predicted']]

# Save the final DataFrame to a new CSV file
final_df_c9.to_csv('majority_vote_predictions6_1.csv', index=False)

In [None]:
# Left-join the reference submission (df_old) with the nine-model ensemble on 'ID'
comparison_df_c9 = df_old.merge(final_df_c9, on='ID', how='left')

# Both inputs have a 'Predicted' column, so the merge suffixes them:
# Predicted_x comes from df_old, Predicted_y from final_df_c9
comparison_df_c9['Is_Match'] = comparison_df_c9['Predicted_x'].eq(comparison_df_c9['Predicted_y'])

# Preview the first rows of the comparison
print(comparison_df_c9.head())

# Rows on which the two submissions disagree
mismatch_count = int((~comparison_df_c9['Is_Match']).sum())

# Size of the merged frame
total_rows = len(comparison_df_c9)

# Share of rows that disagree
mismatch_proportion = mismatch_count / total_rows

print(f"Number of mismatched rows: {mismatch_count}")
print(f"Total number of rows: {total_rows}")
print(f"Proportion of mismatched rows: {mismatch_proportion:.2f}")

   ID  Predicted_x  Predicted_y  Is_Match
0   0            4            5     False
1   1            6            6      True
2   2            4            3     False
3   3            6            6      True
4   4            4            4      True
Number of mismatched rows: 26089
Total number of rows: 100000
Proportion of mismatched rows: 0.26


In [None]:
# Submissions taking part in this ensemble, in Pred1..Pred16 order
submissions_c16 = [
  df1, df2, df3, df4, df5, df6, df7, df8,
  df9, df10, df11, df12, df13, df14, df15, df16,
]

# Combine the predictions into a single DataFrame, using the 'ID' column to align them:
# every submission is looked up by df1's IDs, so a file with a different row
# order still lines up instead of being matched by row position.
combined_df_c16 = pd.DataFrame({'ID': df1['ID']})
for i, submission in enumerate(submissions_c16, start=1):
  aligned = submission.set_index('ID')['Predicted'].reindex(combined_df_c16['ID'])
  if aligned.isna().any():
    raise ValueError(f"Submission {i} is missing predictions for some IDs of df1.")
  combined_df_c16[f'Pred{i}'] = aligned.to_numpy()

# majority_vote is defined once, in the first voting cell of this notebook.

# Apply majority vote for each row
combined_df_c16['Predicted'] = combined_df_c16.apply(majority_vote, axis=1)

# Create final DataFrame with 'ID' and majority voted predictions
final_df_c16 = combined_df_c16[['ID', 'Predicted']]

# Save the final DataFrame to a new CSV file
final_df_c16.to_csv('majority_vote_predictions6_2.csv', index=False)

In [None]:
# Left-join the reference submission (df_old) with the sixteen-model ensemble on 'ID'
comparison_df_c16 = df_old.merge(final_df_c16, on='ID', how='left')

# Both inputs have a 'Predicted' column, so the merge suffixes them:
# Predicted_x comes from df_old, Predicted_y from final_df_c16
comparison_df_c16['Is_Match'] = comparison_df_c16['Predicted_x'].eq(comparison_df_c16['Predicted_y'])

# Preview the first rows of the comparison
print(comparison_df_c16.head())

# Rows on which the two submissions disagree
mismatch_count = int((~comparison_df_c16['Is_Match']).sum())

# Size of the merged frame
total_rows = len(comparison_df_c16)

# Share of rows that disagree
mismatch_proportion = mismatch_count / total_rows

print(f"Number of mismatched rows: {mismatch_count}")
print(f"Total number of rows: {total_rows}")
print(f"Proportion of mismatched rows: {mismatch_proportion:.2f}")

   ID  Predicted_x  Predicted_y  Is_Match
0   0            4            5     False
1   1            6            6      True
2   2            4            3     False
3   3            6            6      True
4   4            4            4      True
Number of mismatched rows: 21292
Total number of rows: 100000
Proportion of mismatched rows: 0.21


In [None]:
# Submissions taking part in this ensemble, in Pred1..Pred3 order
submissions_c3 = [df1, df2, df3]

# Combine the predictions into a single DataFrame, using the 'ID' column to align them:
# every submission is looked up by df1's IDs, so a file with a different row
# order still lines up instead of being matched by row position.
combined_df_c3 = pd.DataFrame({'ID': df1['ID']})
for i, submission in enumerate(submissions_c3, start=1):
  aligned = submission.set_index('ID')['Predicted'].reindex(combined_df_c3['ID'])
  if aligned.isna().any():
    raise ValueError(f"Submission {i} is missing predictions for some IDs of df1.")
  combined_df_c3[f'Pred{i}'] = aligned.to_numpy()

# majority_vote is defined once, in the first voting cell of this notebook.

# Apply majority vote for each row
combined_df_c3['Predicted'] = combined_df_c3.apply(majority_vote, axis=1)

# Create final DataFrame with 'ID' and majority voted predictions
final_df_c3 = combined_df_c3[['ID', 'Predicted']]

# Save the final DataFrame to a new CSV file
final_df_c3.to_csv('majority_vote_predictions5.3.csv', index=False)

In [None]:
# Left-join the reference submission (df_old) with the three-model ensemble on 'ID'
comparison_df_c3 = df_old.merge(final_df_c3, on='ID', how='left')

# Both inputs have a 'Predicted' column, so the merge suffixes them:
# Predicted_x comes from df_old, Predicted_y from final_df_c3
comparison_df_c3['Is_Match'] = comparison_df_c3['Predicted_x'].eq(comparison_df_c3['Predicted_y'])

# Preview the first rows of the comparison
print(comparison_df_c3.head())

# Rows on which the two submissions disagree
mismatch_count = int((~comparison_df_c3['Is_Match']).sum())

# Size of the merged frame
total_rows = len(comparison_df_c3)

# Share of rows that disagree
mismatch_proportion = mismatch_count / total_rows

print(f"Number of mismatched rows: {mismatch_count}")
print(f"Total number of rows: {total_rows}")
print(f"Proportion of mismatched rows: {mismatch_proportion:.2f}")

   ID  Predicted_x  Predicted_y  Is_Match
0   0            4            5     False
1   1            6            6      True
2   2            4            4      True
3   3            6            6      True
4   4            4            4      True
Number of mismatched rows: 20482
Total number of rows: 100000
Proportion of mismatched rows: 0.20


Load model and predict

In [None]:
class FFNN(nn.Module):
  """Configurable feed-forward network stored as one flat ModuleList.

  For every (hidden size, dropout rate) pair the list receives a Linear layer,
  an optional BatchNorm1d, and a Dropout; a final Linear maps to output_size.
  """

  def __init__(self, input_size, hidden_sizes, output_size, dropout_rates, batch_norm=False):
    """Build the layer stack.

    Args:
      input_size: number of input features.
      hidden_sizes: width of each hidden Linear layer, in order.
      dropout_rates: dropout probability paired with each hidden layer
        (pairs are formed with zip, so the shorter sequence sets the depth).
      output_size: number of output features.
      batch_norm: when True, add a BatchNorm1d after each hidden Linear.
    """
    super().__init__()
    self.layers = nn.ModuleList()
    self.hidden_sizes = hidden_sizes
    self.dropout_rates = dropout_rates

    # Assemble one hidden stage at a time, tracking the running feature width
    in_features = input_size
    for width, drop_p in zip(hidden_sizes, dropout_rates):
      stage = [nn.Linear(in_features, width)]
      if batch_norm:
        stage.append(nn.BatchNorm1d(width))
      stage.append(nn.Dropout(drop_p))
      self.layers.extend(stage)
      in_features = width

    # Output head
    self.layers.append(nn.Linear(in_features, output_size))

  def forward(self, x):
    """Run x through the stack; ReLU follows every hidden Linear, the head stays linear."""
    *hidden, head = self.layers
    for module in hidden:
      x = module(x)
      if isinstance(module, nn.Linear):
        x = F.relu(x)
    # The last layer's output is returned without an activation
    return head(x)

  def generate_filename(self):
    """Return 'FFNN_' followed by '<hidden size>-<dropout rate>' parts joined with '_'."""
    parts = [f"{hs}-{dr}" for hs, dr in zip(self.hidden_sizes, self.dropout_rates)]
    return "FFNN_" + "_".join(parts)


In [None]:
# Load three saved checkpoints. .eval() is called directly on each result, so
# the files appear to hold whole pickled modules rather than state dicts; the
# FFNN class therefore has to be defined before this cell runs.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# NOTE(review): no map_location is passed, so tensors are restored to the device
# they were saved from - confirm that matches `device` used at prediction time.
model_load1 = torch.load("FFNN_[128, 64, 32]_1_[0.5, 0.3, 0.3]_True_0001_500_wd.pth")
model_load2 = torch.load("FFNN_[128, 64]_1_[0.5, 0.4]_True_0001_500_wd.pth")
model_load3 = torch.load("FFNN_128-0.5_64-0.4_64-0.4_32-0.3.pth")

# Switch every model to evaluation mode. The last call is also the cell's final
# expression, so its return value (model_load3) is what the notebook displays.
model_load1.eval()
model_load2.eval()
model_load3.eval()

FFNN(
  (layers): ModuleList(
    (0): Linear(in_features=2096, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=64, out_features=64, bias=True)
    (7): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=64, out_features=32, bias=True)
    (10): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [None]:
# List to store predictions of model_load2 (one int per test sample)
predictions_load2 = []

with torch.no_grad():
  for inputs in test_standard_normalized_dataloader:
    # The feature tensor is the first element of each batch
    inputs = inputs[0]
    inputs = inputs.to(device)

    # Forward pass; flatten to 1-D so a batch of any size is handled the same
    # way (presumably one output value per sample - verify against the model)
    outputs_2 = model_load2(inputs).reshape(-1)

    # Round to the nearest integer and clamp into the range [0, 7]
    processed_output = torch.clamp(outputs_2.round(), 0, 7)

    # Store the predictions as Python ints, one per sample in the batch
    predictions_load2.extend(int(value) for value in processed_output.cpu().tolist())

# 'predictions_load2' now contains the processed predictions for your test dataset