In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
# Load the training data from the working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Load the test data and summarise its columns, dtypes and non-null counts.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
test_df.info()

In [75]:
# Split the training columns by dtype: 64-bit numeric columns vs. object
# (string) columns.
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = train_df.select_dtypes(include='object').columns.tolist()

In [None]:
# Count plots for every categorical column, laid out four panels per row.
# The grid is sized from the list that is actually plotted (cat_cols), so
# every column is guaranteed a subplot slot.
n_cols = len(cat_cols)
n_rows = max(1, -(-n_cols // 4))  # ceiling division; at least one row so the figure has a height

plt.figure(figsize=(20, 6 * n_rows))
for i, col in enumerate(cat_cols, 1):
    plt.subplot(n_rows, 4, i)
    sns.countplot(x=col, data=train_df)
    plt.title(f"{col} distribution")
    plt.xticks(rotation=45)  # category labels can be long; tilt them to avoid overlap

plt.tight_layout()
plt.show()

In [None]:
# Scatter every feature (everything except the identifier and the target)
# against the target, four panels per row.
cols = train_df.columns.drop(["id", "NObeyesdad"]).tolist()
n_rows = (len(cols) + 3) // 4  # number of grid rows, rounded up
plt.figure(figsize=(20, 20))

for position, feature in enumerate(cols, start=1):
    ax = plt.subplot(n_rows, 4, position)
    sns.scatterplot(data=train_df, x=feature, y="NObeyesdad", ax=ax)
    ax.set_title(f"{feature} vs NObeyesdad")
plt.tight_layout()
plt.show()

In [78]:
from sklearn.preprocessing import LabelEncoder

# Encode the target classes as integers. The fitted encoder is kept so that
# integer predictions can be mapped back to the original class labels.
label_encoder = LabelEncoder()
train_df["NObeyesdad"] = label_encoder.fit_transform(train_df["NObeyesdad"])

# One-hot encode the remaining categorical columns of both frames.
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

# The two frames are encoded independently, so a category that appears in only
# one of them produces a dummy column the other lacks. Reindex the test frame
# onto the training columns (minus the target): columns unknown to the
# training data are dropped, missing ones are added as all-zero, and the
# column order matches the training frame.
test_df = test_df.reindex(columns=train_df.columns.drop("NObeyesdad"), fill_value=0)

In [None]:
# Pairwise correlations between all columns of the encoded training frame.
cormat = train_df.corr()

# "id" is a row identifier, not a feature, so it is removed from BOTH axes of
# the plotted matrix (cormat itself is left untouched).
plt.figure(figsize=(12, 10))
sns.heatmap(cormat.drop(index="id", columns="id"))
plt.title("Correlation matrix")
plt.show()

In [80]:
from sklearn.model_selection import train_test_split

# Features are every column except the row identifier and the target.
X = train_df.drop(columns=["id", "NObeyesdad"])
y = train_df.loc[:, "NObeyesdad"]

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

# NOTE: despite its name, train_preds holds the predictions for the
# validation split (X_val), not for the training split.
train_preds = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=train_preds)
print(f"Validation accuracy: {accuracy}")

In [83]:
# Align the test features with the columns the model was trained on (X):
# dummy columns for categories unseen in training are dropped, columns the
# test frame lacks are added as all-zero, and the column order follows X.
# test_df itself is left unmodified, so re-running this cell is safe.
test_features = test_df.drop(columns="id").reindex(columns=X.columns, fill_value=0)
test_preds = model.predict(test_features)

In [None]:
# Cross-tabulate the actual validation labels against the predicted ones and
# draw the resulting counts as an annotated heatmap.
confmat = pd.crosstab(
    index=y_val,
    columns=train_preds,
    rownames=["Actual"],
    colnames=["Predicted"],
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(confmat, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set_title("Confusion matrix")
plt.show()

In [None]:
# Map the integer predictions back to the original class labels and pair them
# with the test ids.
predicted_labels = label_encoder.inverse_transform(test_preds)
submission = pd.DataFrame({"id": test_df["id"], "NObeyesdad": predicted_labels})
submission.head()

In [86]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)