In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, in os.walk order,
# and print its full path (directories that contain no files print nothing).
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


# Locations of the competition CSVs inside the Kaggle input mount.
TRAIN_CSV = '/kaggle/input/summer-analytics/hacktrain.csv'
TEST_CSV = '/kaggle/input/summer-analytics/hacktest.csv'

# Read both files into DataFrames: labelled training rows and unlabelled test rows.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))


In [None]:
# Column dtypes and non-null counts (info() prints its own output).
train_df.info()

# Summary statistics. Only the last expression of a cell is displayed automatically,
# so this intermediate result has to be printed explicitly or it is silently discarded.
print(train_df.describe())

# Check missing values: number of nulls per column (displayed as the cell's output).
train_df.isnull().sum()

In [None]:
# Preview the first rows of the training frame (shown as the cell's output).
train_df.head()

In [None]:
# Step 2: Preprocessing
# Encode the class labels as integers. The encoded values are stored in a new Series
# instead of being written back into train_df: overwriting the column made this cell
# non-idempotent, because a second run would fit the encoder on the already-encoded
# integers and label_encoder.inverse_transform would then return integers rather
# than the original class names.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train_df['class']),
    index=train_df.index,
    name='class',
)

# Features are every column except the row identifier and the target.
X = train_df.drop(['ID', 'class'], axis=1)

# Fill missing values with the per-column mean
X = X.fillna(X.mean())

# Step 3: Train/Test Split and Model
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline random forest; fit() is the last expression so the fitted estimator is displayed.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Generate predictions for the held-out validation split
y_pred = model.predict(X_val)

# Display names for the report, in encoder order (position i <-> encoded label i)
target_names = [str(c) for c in label_encoder.classes_]

# Pass the complete list of encoded labels explicitly. Without `labels`, the report
# infers the label set from y_val/y_pred, and if a class happens to be absent from the
# (unstratified) validation split its length no longer matches target_names and
# classification_report raises a ValueError.
all_labels = list(range(len(target_names)))

# Print the classification report
print(classification_report(y_val, y_pred, labels=all_labels, target_names=target_names))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Build the confusion matrix over every encoded class (0..n_classes-1), not just the
# classes that happen to appear in y_val/y_pred, so its shape always matches the
# number of display labels passed below.
cm = confusion_matrix(y_val, y_pred, labels=list(range(len(target_names))))

# Rows are true classes, columns are predicted classes, labelled with the class names.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap='Blues')

In [None]:
# Drop the identifier and select the training feature columns, in training order.
# Indexing with X.columns raises a KeyError immediately if the test file is missing
# a feature the model was trained on.
X_test = test_df.drop(['ID'], axis=1)[X.columns]

# Impute missing test values with the same per-column means used for the training
# features. X was mean-filled above, and filling with the mean leaves each column
# mean unchanged, so X.mean() reproduces the training fill values. The original cell
# passed the raw, un-imputed test frame to a model trained on imputed data.
X_test = X_test.fillna(X.mean())

# Predict encoded class ids
test_preds = model.predict(X_test)

# Decode the ids back to the original class labels
test_preds_labels = label_encoder.inverse_transform(test_preds)

In [None]:
#Stacking 
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Level-0 learners. Both are seeded so the stacked model is reproducible across
# runs, consistent with the random_state=42 used elsewhere in this notebook.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=42)),
]

# Level-1 learner that combines the base models' cross-validated predictions.
meta_model = LogisticRegression()

# cv=5: the meta-model is trained on 5-fold out-of-fold predictions of the base models.
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)
stacked_model.fit(X_train, y_train)

# Validation predictions of the stacked model (overwrites any previous y_pred).
y_pred = stacked_model.predict(X_val)

In [None]:
#Use XGBoost
from xgboost import XGBClassifier

# Gradient-boosted trees scored with multiclass log-loss; labels are already
# integer-encoded, so XGBoost's own label encoder is switched off.
xgb_settings = {'use_label_encoder': False, 'eval_metric': 'mlogloss'}
model = XGBClassifier(**xgb_settings)
model.fit(X_train, y_train)

# Validation predictions for this model (replaces the previous y_pred).
y_pred = model.predict(X_val)

In [None]:
#random forest baseline (refit)
from sklearn.ensemble import RandomForestClassifier

# Seeded 100-tree random forest trained on the training split.
forest_settings = dict(n_estimators=100, random_state=42)
model = RandomForestClassifier(**forest_settings)
model.fit(X_train, y_train)

# Validation predictions for this forest (replaces the previous y_pred).
y_pred = model.predict(X_val)

In [None]:
#hyperparameter tuning

from sklearn.model_selection import GridSearchCV

# Search space: 2 x 3 x 2 = 12 random-forest configurations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive search, each candidate scored by 3-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and report its hyperparameters.
best_model = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)

In [None]:
#feature importance
import pandas as pd

# Pair each fitted importance score with its feature (column) name.
importances = pd.Series(data=model.feature_importances_, index=X_train.columns)

# Bar chart of the features, most important first.
ranked_importances = importances.sort_values(ascending=False)
ranked_importances.plot.bar()

In [None]:
#Check class balance: count how many training rows fall into each encoded class id
import numpy as np
np.bincount(y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class_weight='balanced' to counter class imbalance; seeded
# for reproducibility.
balanced_settings = {'class_weight': 'balanced', 'random_state': 42}
model = RandomForestClassifier(**balanced_settings)

# fit() stays the final expression so the fitted estimator is shown as cell output.
model.fit(X_train, y_train)

In [None]:
#Cross validation 
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current `model` on the training split.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
mean_accuracy = scores.mean()

# Report the per-fold scores and their average.
print("Cross-validation scores:", scores)
print("Mean accuracy:", mean_accuracy)

In [None]:
from lightgbm import LGBMClassifier
# DART-boosted LightGBM classifier. The original cell only constructed the estimator,
# which left `model` bound to an unfitted object while `y_pred` still belonged to the
# previous model. Fit it and refresh the validation predictions, following the same
# construct / fit / predict pattern as the other experiment cells. Seeded for
# reproducibility.
model = LGBMClassifier(boosting_type='dart', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
#Analysing errors
# Boolean mask over the validation rows: True where the prediction missed the label.
is_misclassified = y_val != y_pred

# Feature rows of the misclassified validation samples; show the first few.
wrong = X_val.loc[is_misclassified]
print(wrong.head())

In [None]:
#Feature selection 
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 10 features with the highest ANOVA F-score against the target.
# NOTE(review): the scores are fitted on the full X / y, which includes the rows
# held out for validation - confirm this is intended before using X_new for modelling.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)
X_new = selector.transform(X)

In [None]:
from catboost import CatBoostClassifier


# Seeded CatBoost classifier with training output silenced, fitted on the training split.
catboost_settings = dict(verbose=0, random_state=42)
cat = CatBoostClassifier(**catboost_settings)
cat.fit(X_train, y_train)

# Per-class probabilities on the test features, kept for ensembling.
cat_probs = cat.predict_proba(X_test)


In [None]:
#import and build model
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Initialize base models
# All three ensemble members share the same seed.
seed_kwargs = {'random_state': 42}

rf = RandomForestClassifier(n_estimators=200, **seed_kwargs)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **seed_kwargs)
cat = CatBoostClassifier(verbose=0, **seed_kwargs)

In [None]:
#create voting classifier
# (name, estimator) pairs for the ensemble members.
voting_members = [('rf', rf), ('xgb', xgb), ('cat', cat)]

# Soft voting: the ensemble averages the members' predicted class probabilities.
voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

In [None]:
#train the ensemble
# Fits the voting classifier on the training split; as the cell's last expression,
# the fitted estimator is also displayed.
voting_clf.fit(X_train, y_train)

In [None]:
#predict
# Impute the test features with the training column means so they are preprocessed
# like the data the ensemble was fitted on (a no-op if X_test is already filled).
voting_probs = voting_clf.predict_proba(X_test.fillna(X.mean()))

# Column j of predict_proba corresponds to voting_clf.classes_[j], so map the argmax
# position through classes_ instead of treating the position itself as the encoded
# label. The two only coincide when every encoded class occurs in y_train.
final_preds = voting_clf.classes_[np.argmax(voting_probs, axis=1)]

# Decode the encoded ids back to the original class labels
final_preds_labels = label_encoder.inverse_transform(final_preds)



In [None]:
# Create a submission DataFrame from the voting ensemble's decoded predictions.
# The original cell wrote `test_preds_labels` (the early baseline random forest),
# which left the ensemble predictions computed in the preceding cell unused.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'class': final_preds_labels
})

# Save the submission file (no index column) to the working directory
submission.to_csv('submission.csv', index=False)