# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
# Read the training data from the Kaggle input directory
train_csv_path = '/kaggle/input/my-files/train.csv'
train_df = pd.read_csv(train_csv_path)

# Show which columns the training file provides
print(train_df.columns)

# Collapse the per-model winner flags of one row into a single label
def get_winner(row):
    """Return 'a', 'b' or 'tie' for a single row.

    The flags are checked in order: 'winner_model_a' first, then
    'winner_model_b'. The first flag equal to 1 decides the label; if
    neither flag is 1 the row is labelled 'tie'.
    """
    flag_columns = (('a', 'winner_model_a'), ('b', 'winner_model_b'))
    for label, column in flag_columns:
        if row[column] == 1:
            return label
    return 'tie'

# Label every row with get_winner (reads the winner_model_a / winner_model_b flags)
winner_labels = train_df.apply(get_winner, axis=1)
train_df['winner'] = winner_labels

# Preview the first rows so the new 'winner' column can be inspected
print(train_df.head())

# Sanity check: prints True when the 'winner' column is present
print('winner' in train_df.columns)

# Bar chart of how often each 'winner' label occurs
winner_axes = sns.countplot(data=train_df, x='winner')
winner_axes.set_title("Distribution of Winners")
winner_axes.set_xlabel("Winner")
winner_axes.set_ylabel("Count")
plt.show()

# Feature engineering: number of characters in each response.
# A missing response is read by pandas as NaN (a float), on which len() raises
# TypeError; fillna('') makes such a response count as length 0 instead, and
# .str.len() computes the lengths without a Python-level call per row.
train_df['response_a_length'] = train_df['response_a'].fillna('').astype(str).str.len()
train_df['response_b_length'] = train_df['response_b'].fillna('').astype(str).str.len()

# Display the correlation matrix of the two length features
length_columns = ['response_a_length', 'response_b_length']
correlation_matrix = train_df[length_columns].corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title("Correlation Matrix")
plt.show()

# Prepare the data for training: the two response-length features
X = train_df[['response_a_length', 'response_b_length']]
y = train_df['winner']

# One-hot encode the target variable (one indicator column per winner label)
y = pd.get_dummies(y)

# Split the data into training and validation sets (seeded for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model: MultiOutputClassifier fits one random forest
# per indicator column. The forest is seeded, like the split above, so that
# repeated runs produce the same model and the same log loss.
model = MultiOutputClassifier(RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

# Make predictions: one probability array per indicator column
y_pred = model.predict_proba(X_val)

# Convert list of arrays to a single (n_samples, n_labels) array, keeping
# column 1 of each array (presumably the probability that the indicator is set).
# NOTE(review): pred[:, 1] assumes every indicator column took both values in
# the training split; otherwise the array has a single column — confirm.
y_pred = np.array([pred[:, 1] for pred in y_pred]).T

# The per-label forests are fitted independently, so a row of y_pred is not
# guaranteed to sum to 1 (it can even be all zeros). Clip away exact zeros and
# rescale each row so that log_loss is given a proper probability distribution.
y_pred = np.clip(y_pred, np.finfo(float).eps, None)
y_pred = y_pred / y_pred.sum(axis=1, keepdims=True)

# Evaluate the model
loss = log_loss(y_val, y_pred)
print(f'Log Loss: {loss}')

# Prepare the submission file
# Load the test set
test_df = pd.read_csv('/kaggle/input/my-files/test.csv')

# Add the same length features as used for training.
# A missing response is read by pandas as NaN (a float), on which len() raises
# TypeError; fillna('') makes such a response count as length 0 instead.
test_df['response_a_length'] = test_df['response_a'].fillna('').astype(str).str.len()
test_df['response_b_length'] = test_df['response_b'].fillna('').astype(str).str.len()

# Make predictions on the test set: one probability array per target column
test_pred = model.predict_proba(test_df[['response_a_length', 'response_b_length']])

# Convert list of arrays to a single (n_samples, n_labels) array
test_pred = np.array([pred[:, 1] for pred in test_pred]).T

# Each target column is predicted on its own, so a row is not guaranteed to sum
# to 1 and may even be all zeros. Clip away exact zeros (so no row sum is 0)
# and rescale each row into a proper probability distribution.
test_pred = np.clip(test_pred, np.finfo(float).eps, None)
test_pred = test_pred / test_pred.sum(axis=1, keepdims=True)

# Format the predictions for submission.
# NOTE(review): the column positions 0/1/2 assume the target columns the model
# was fitted on are ordered a, b, tie — confirm against the training target.
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': test_pred[:, 0],
    'winner_model_b': test_pred[:, 1],
    'winner_tie': test_pred[:, 2]
})

# Save the submission file (without the DataFrame index column)
submission.to_csv('submission.csv', index=False)
