In [None]:
import pandas as pd
from sklearn.utils import shuffle

# Loading Data

## pandas.read_csv

In [None]:
# Folder holding the raw CSV files, relative to the notebook's working directory.
file_path = "../../data/raw/"

# Read the feature table and preview its first rows.
X = pd.read_csv(f"{file_path}x_train_gr_smpl.csv", sep=',')
X.head(n=5)

In [None]:
# Label file covering all classes; its single column is renamed to 'target'.
Y = pd.read_csv(f"{file_path}y_train_smpl.csv", sep=',')
Y.columns = ['target']

# One label file per class index (y_train_smpl_0 ... y_train_smpl_9); the
# single column of each is renamed to 'target_<i>'.
Yn = []
for i in range(10):
    label_frame = pd.read_csv(f"{file_path}y_train_smpl_{i}.csv", sep=',')
    label_frame.columns = [f"target_{i}"]
    Yn.append(label_frame)

Y.tail(n=5)

# Create datasets

In [None]:
# Put the features, the 'target' column and every 'target_<i>' column side by
# side in one call. A single concat avoids re-copying the whole (wide) frame
# once per label column, which the previous concat-in-a-loop did.
train_smpl = pd.concat([X, Y, *Yn], axis=1)

# Names of the feature columns, i.e. everything that came from X. Taking them
# from X directly avoids hard-coding how many label columns were appended.
cols = X.columns

# Data Randomisation

## sklearn.utils.shuffle

In [None]:
# Randomise the row order with a fixed seed so repeated runs give the same
# permutation. NOTE(review): the rows appear to keep their original index
# labels after shuffling, so the index is no longer in 0..n-1 order - confirm.
train_smpl = shuffle(train_smpl, random_state=42)
train_smpl.head(n=5)

In [None]:
# Print a summary of the combined frame (index, column dtypes, memory usage).
train_smpl.info()

In [None]:
# Show descriptive statistics for the columns of the combined frame.
train_smpl.describe()

# Reducing the size

# Features/Attributes Selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2


def select_k_best(features, labels, k):
    """Keep the ``k`` columns of ``features`` with the highest chi2 score.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    labels : pandas.Series
        Class labels, one per row of ``features``.
    k : int
        Number of columns to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns, labelled 0..k-1, carrying the same row index
        as ``features``.
    """
    selected = SelectKBest(chi2, k=k).fit_transform(features, labels)
    # fit_transform hands back a bare array. Re-attach the caller's row index:
    # train_smpl was shuffled, so a default 0..n-1 index would make the
    # index-aligned pd.concat below pair each row with another row's label.
    return pd.DataFrame(selected, index=features.index)


print(train_smpl.shape)

# `cols` (every column that is not a label) replaces the hard-coded
# `columns[:2303]` slice so the feature set matches the one computed when the
# dataset was assembled.
# NOTE(review): selection is fitted on all rows, before the train/test split,
# so the held-out labels influence which features are kept - confirm this is
# acceptable for the reported scores.
best_20 = select_k_best(train_smpl[cols], train_smpl['target'], 20)
train_smpl_20 = pd.concat([best_20, train_smpl['target']], axis=1)
print(train_smpl_20.shape)

best_50 = select_k_best(train_smpl[cols], train_smpl['target'], 50)
train_smpl_50 = pd.concat([best_50, train_smpl['target']], axis=1)
print(train_smpl_50.shape)

best_100 = select_k_best(train_smpl[cols], train_smpl['target'], 100)
train_smpl_100 = pd.concat([best_100, train_smpl['target']], axis=1)
print(train_smpl_100.shape)

# Test Train Split

In [None]:
from sklearn.model_selection import train_test_split


def holdout_split(features, labels):
    """Split into (X_train, X_test, y_train, y_test), holding out 33% of rows.

    The seed is fixed, so every call on inputs of the same length partitions
    the rows identically.
    """
    return train_test_split(features, labels, test_size=0.33, random_state=42)


# Full feature set against the 'target' column. `cols` replaces the
# hard-coded `columns[:2303]` slice so the split uses the feature columns
# computed when the dataset was assembled.
X_train, X_test, y_train, y_test = holdout_split(train_smpl[cols], train_smpl['target'])

# Reduced feature sets: the first k columns are the selected features, the
# remaining column is 'target'.
X_train_20, X_test_20, y_train_20, y_test_20 = holdout_split(train_smpl_20[train_smpl_20.columns[:20]], train_smpl_20['target'])
X_train_50, X_test_50, y_train_50, y_test_50 = holdout_split(train_smpl_50[train_smpl_50.columns[:50]], train_smpl_50['target'])
X_train_100, X_test_100, y_train_100, y_test_100 = holdout_split(train_smpl_100[train_smpl_100.columns[:100]], train_smpl_100['target'])

# One split per 'target_<i>' column, each stored as a 4-tuple.
train_test_splits_results = [
    tuple(holdout_split(train_smpl[cols], train_smpl[f"target_{i}"]))
    for i in range(10)
]

# Transpose the list of 4-tuples into four parallel lists, one entry per class.
mapped_result = list(map(list, zip(*train_test_splits_results)))
X_train_10classes, X_test_10classes, y_train_10classes, y_test_10classes = mapped_result

In [None]:
# Split for the 'target_0' label column only, then preview the features and
# the first two per-class label columns. `cols` replaces the hard-coded
# `columns[:2303]` slice so the same feature columns are used as when the
# dataset was assembled.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(train_smpl[cols], train_smpl['target_0'], test_size=0.33, random_state=42)
print(train_smpl[cols].head())
print(train_smpl['target_0'].head())
print(train_smpl['target_1'].head())

# Modeling

## Multinomial Naive Bayes model (multi-class classifier)

### Before Features/Attributes Selection

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit on the full feature set; the cell's output is the score on the held-out rows.
clf = MultinomialNB().fit(X_train, y_train)
clf.score(X_test, y_test)

### After Features/Attributes Selection (20/50/100)

In [None]:
# Collect the reduced splits (20, 50 and 100 selected features) so they can be
# evaluated in turn.
X_trains_reduced = [X_train_20, X_train_50, X_train_100]
X_tests_reduced = [X_test_20, X_test_50, X_test_100]
y_trains_reduced = [y_train_20, y_train_50, y_train_100]
y_tests_reduced = [y_test_20, y_test_50, y_test_100]

# Train a fresh model per feature count and print its held-out score.
for X_trains, X_tests, y_trains, y_tests in zip(
    X_trains_reduced, X_tests_reduced, y_trains_reduced, y_tests_reduced
):
    clf = MultinomialNB().fit(X_trains, y_trains)
    print(clf.score(X_tests, y_tests))

## Gaussian Naive Bayes model (mono-class classifier)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit one Gaussian NB model per 'target_<i>' split and print its held-out score.
# The loop variables carry an `_i` suffix on purpose: reusing the names
# X_train / X_test / y_train / y_test here would overwrite the 'target' split,
# so re-running an earlier modelling cell afterwards would silently train on
# the last per-class labels instead.
for X_train_i, X_test_i, y_train_i, y_test_i in zip(
    X_train_10classes, X_test_10classes, y_train_10classes, y_test_10classes
):
    clf = GaussianNB()
    clf.fit(X_train_i, y_train_i)
    print(clf.score(X_test_i, y_test_i))