In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Location of the training CSV on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Spring_2024/RDS/Project/train.csv'

# Read the training set, then preview its first rows followed by the column dtypes.
train_data = pd.read_csv(TRAIN_CSV_PATH)
print(train_data.head(), train_data.dtypes, sep='\n')

# Identity columns used by the analysis cells below.
identity_columns = [
    'male', 'female',
    'black', 'white',
    'jewish', 'christian', 'muslim',
]

In [None]:
# Compare the mean 'target' (toxicity) score of comments that carry at least
# one identity value against comments that carry none.
#
# how='all' drops a row only when *every* identity column is NaN, so
# `identity_present` is the exact complement of the `identity_absent` mask.
# The dropna default (how='any') would keep only fully populated rows and
# silently leave partially populated rows out of both groups.
identity_present = train_data.dropna(subset=identity_columns, how='all')  # DataFrame
identity_absent = train_data[identity_columns].isna().all(axis=1)  # boolean row mask

# Two-row summary table: one row per group.
average_toxicity = pd.DataFrame({
    "Condition": ["With Identity Info", "Without Identity Info"],
    "Average Toxicity Score": [
        identity_present['target'].mean(),
        train_data.loc[identity_absent, 'target'].mean(),
    ],
})

# Share of rows (in percent) whose value is missing, per identity column.
missing_percentage = (
    train_data[identity_columns]
    .isna()
    .mean()
    .mul(100)
    .reset_index()
)
missing_percentage.columns = ['Identity', 'Percentage Missing']

# Print both summary tables as plain text without the index column.
print(
    "Average Toxicity Scores:",
    average_toxicity.to_string(index=False),
    sep="\n",
)
print(
    "\nPercentage of Missing Data for Identity Columns:",
    missing_percentage.to_string(index=False),
    sep="\n",
)



In [None]:
# Grouped bar chart: for each identity column, the number of rows where the
# value is present is drawn next to the number of rows where it is missing.
labels = identity_columns
identity_values = train_data[identity_columns]
non_missing_counts = identity_values.notna().sum().tolist()
missing_counts = identity_values.isna().sum().tolist()

x = np.arange(len(labels))
width = 0.35  # width of one bar; each present/missing pair is centred on its tick

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width / 2, non_missing_counts, width, label='Present', color='green')
ax.bar(x + width / 2, missing_counts, width, label='Missing', color='red')

ax.set_ylabel('Counts')
ax.set_title('Presence of Identity Information in Comments')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Mean 'target' (toxicity) score for comments with at least one identity value
# versus comments with none.
#
# how='all' drops a row only when every identity column is NaN, which makes
# `identity_present` the exact complement of the `identity_absent` mask. With
# the dropna default (how='any'), rows that have only some identity columns
# filled would be counted in neither group.
identity_present = train_data.dropna(subset=identity_columns, how='all')  # DataFrame
identity_absent = train_data[identity_columns].isna().all(axis=1)  # boolean row mask

print("\nAverage Toxicity Scores:")
print(f"With Identity Info: {identity_present['target'].mean()}")
print(f"Without Identity Info: {train_data.loc[identity_absent, 'target'].mean()}")

# Pairwise correlations between the toxicity score columns and the identity columns.
toxicity_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
numerical_columns = toxicity_columns + identity_columns
correlation_matrix = train_data[numerical_columns].corr()

# Annotated heatmap of the correlation matrix, two decimals per cell.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numeric and Identity Features')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()