In [1]:
#Importing all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the train data
train_df = pd.read_csv("/kaggle/input/week4-practice-hackathon-2024/Train.csv")

In [3]:
#checking the correlation among various parameters
corr = train_df.corr()

In [4]:
#creating a normalizing function
def normalize_num_cols(df):
    scaler = StandardScaler()
    numeric_cols = df.select_dtypes(include=['number']).columns
    
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df

In [5]:
# Get the absolute value of the correlation
cor_target = abs(corr["class"])

# Select highly correlated features 
relevant_features = cor_target[cor_target>0.05]
# Collect the names of the features
names = [index for index, value in relevant_features.items()]

# Drop the target variable from the results
names.remove('class')

# Display the results
print(names)

['V1', 'V2', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V11', 'V13', 'V33', 'V36']


In [6]:
cols=names
print(cols)

['V1', 'V2', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V11', 'V13', 'V33', 'V36']


In [7]:
X=train_df[cols]
X.head()

Unnamed: 0,V1,V2,V4,V5,V6,V7,V8,V10,V11,V13,V33,V36
0,0.0,1.0,29,4528.0,138.0,3.0,29,0.0,3,29.0,0,7
1,0.0,20.0,14,4183.0,183.0,4.0,26,0.0,2,23.0,3,6
2,14.0,2.0,9,1113.0,15.0,37.0,31,1.0,3,9.0,3,5
3,0.0,175.0,16,4250.0,68.0,8.0,30,0.0,5,16.0,2,6
4,2.0,-1.0,3,0.0,0.0,2.0,5,1.0,1,0.0,3,6


In [9]:
#normalizing the data
normalize_num_cols(X)
X.head()

Unnamed: 0,V1,V2,V4,V5,V6,V7,V8,V10,V11,V13,V33,V36
0,-0.418695,-0.272333,2.320468,0.587431,0.251593,-0.308159,0.823911,-1.030996,-0.227171,1.855821,-1.535646,0.479198
1,-0.418695,-0.223524,0.643003,0.515392,0.436736,-0.296182,0.608346,-1.030996,-0.367593,1.320058,1.057712,-0.024495
2,0.987937,-0.269764,0.083849,-0.125648,-0.254465,0.099077,0.967622,0.36954,-0.227171,0.069942,1.057712,-0.528189
3,-0.418695,0.174653,0.866665,0.529382,-0.036407,-0.248272,0.895767,-1.030996,0.053674,0.695,0.19326,-0.024495
4,-0.217747,-0.27747,-0.587137,-0.358051,-0.316179,-0.320137,-0.900613,0.36954,-0.508016,-0.733704,1.057712,-0.024495


In [11]:
#defining the target variable 
y=train_df['class']
print(y)

0        1
1        1
2        0
3        1
4        1
        ..
40771    1
40772    0
40773    1
40774    0
40775    1
Name: class, Length: 40776, dtype: int64


In [12]:
#splitting the data into test and train sets 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

In [13]:
print(y_train.value_counts())

class
1    14378
0    14165
Name: count, dtype: int64


In [14]:
#importing the model 
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()

In [15]:
model.fit(X_train, y_train)

In [16]:
#predicting the target variable 
y_pred = model.predict(X_val)
print(y_pred)

[0 1 0 ... 0 0 0]


In [18]:
#calculating the f1 score 
f1 = f1_score(y_val, y_pred)
print(f"F1 score: {f1}")

F1 score: 0.5932705248990578


In [21]:
#loading the test dataset
test_data = pd.read_csv("/kaggle/input/week4-practice-hackathon-2024/Test.csv")

In [22]:
Index=test_data['Index']
test_data.drop(['Index'],axis=1,inplace=True)
columns_to_keep = ['V1', 'V2', 'V4', 'V5', 'V6', 'V7', 'V8', 'V10', 'V11', 'V13', 'V33', 'V36']
test_data = test_data[columns_to_keep]


In [23]:
test_data.shape

(17476, 12)

In [25]:
#predicting the target variable over the test dataset
y_test = model.predict(test_data)

In [26]:
#creating a new dataframe equivalent to the submission format 
result = pd.DataFrame({
    'Index': Index,
    'class': y_test
})

In [28]:
#viewing the result
result

Unnamed: 0,Index,class
0,40977,1
1,24157,0
2,56238,1
3,55411,1
4,53175,0
...,...,...
17471,17197,1
17472,14094,0
17473,34598,1
17474,17845,1


In [29]:
result.to_csv("submission8.csv", index=False) #(for making submission file)