In [1]:
import sys
import os
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score,classification_report

## Congressional Voting Dataset

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

# Numeric codes for every categorical token in the file. Votes: "y" -> 1,
# "n" -> 0. Party label: "democrat" -> 1, "republican" -> 0.
# "unknown" is coded as 0, so it is indistinguishable from an "n" vote.
vote_codes = {"y": 1, "n": 0, "democrat": 1, "republican": 0, "unknown": 0}

# A single dict-based replace stands in for the deprecated DataFrame.applymap
# plus three chained replace calls. The explicit cast makes the integer dtype
# independent of pandas' implicit object-column downcasting.
# NOTE(review): astype raises if the CSV holds a token outside vote_codes or a
# missing value -- presumably the file contains only the five tokens above.
df_vote = (
    pd.read_csv("CongressionalVotingID.shuf.lrn.csv", index_col="ID")
    .replace(vote_codes)
    .astype("int64")
)


In [3]:
# Hold out 40% of the voting records for testing; the split is seeded so it
# is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_vote.iloc[:, 1:],  # features: every column after the first
    df_vote.iloc[:, 0],   # target: the first column
    test_size=0.4,
    random_state=11,
)

In [4]:
# Base forest; n_estimators and max_features set here are overridden by the
# grid below for every candidate that is actually fitted.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
)

# 3 x 3 x 3 = 27 hyper-parameter combinations.
param_grid = {
    'n_estimators': [300, 200, 400],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search, ranked by micro-averaged F1.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
    scoring="f1_micro",
)
CV_rfc.fit(x_train, y_train)
CV_rfc.best_params_

{'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}

In [5]:
# Score the refitted best estimator on the held-out voting records.
y_pred = CV_rfc.predict(x_test)
vote_test_f1 = f1_score(y_test, y_pred)
vote_best_params = CV_rfc.best_params_

print('F1-score:', vote_test_f1)
print('The best parameters are:', vote_best_params)
# Trailing expression: also shows the winning parameters as the cell output.
CV_rfc.best_params_

F1-score: 0.9607843137254902
The best parameters are: {'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}


{'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}

## Spam Dataset

In [6]:
# Load the spam dataset; read_csv is called with its defaults, so the file is
# parsed as comma-separated with the first row used as column names.
data = pd.read_csv("../datasets/spam_final_df.data")

In [7]:
# Split features (all columns but the last) from the target (last column),
# holding out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    test_size=0.2,
    random_state=26,
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

# The model cells fit and predict on the lowercase names x_train / x_test, so
# the scaled values were previously computed and then never used. Rebind the
# lowercase names to the scaled features (keeping column labels and row index)
# so the scaling actually reaches the model. X_train / X_test stay available
# as plain arrays.
x_train = pd.DataFrame(X_train, columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(X_test, columns=x_test.columns, index=x_test.index)

In [8]:
# Base forest; the grid below overrides n_estimators and max_features for
# every candidate that is fitted.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
)

# Candidate values searched exhaustively (27 combinations).
param_grid = {
    'n_estimators': [300, 200, 400],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validation, candidates ranked by micro-averaged F1.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
    scoring="f1_micro",
)
CV_rfc.fit(x_train, y_train)
CV_rfc.best_params_

{'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 400}

In [9]:
# Score the refitted best estimator on the held-out spam rows.
y_pred = CV_rfc.predict(x_test)
spam_test_f1 = f1_score(y_test, y_pred)

print('F1-score:', spam_test_f1)
print('The best parameters are:', CV_rfc.best_params_)

F1-score: 0.9415041782729805
The best parameters are: {'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 400}


## Flag Dataset

In [10]:
# Load the flags dataset.

# Human-readable labels for the integer codes in the "religion" column
# (codes start at 0).
religions: dict = dict(enumerate([
    "Catholic",
    "Other Christian",
    "Muslim",
    "Buddhist",
    "Hindu",
    "Ethnic",
    "Marxist",
    "Others",
]))

# Human-readable labels for the integer codes in the "language" column
# (codes start at 1).
languages: dict = dict(enumerate([
    "English",
    "Spanish",
    "French",
    "German",
    "Slavic",
    "Other Indo-European",
    "Chinese",
    "Arabic",
    "Japanese/Turkish/Finnish/Magyar",
    "Others",
], start=1))

# The data file has no header row, so the column names are supplied here in
# file order.
columns: list = [
    "name", "landmass", "zone", "area", "population", "language", "religion",
    "bars", "stripes", "colours",
    "red", "green", "blue", "gold", "white", "black", "orange",
    "mainhue",
    "circles", "crosses", "saltires", "quarters", "sunstars",
    "crescent", "triangle", "icon", "animate", "text",
    "topleft", "botright",
]

flags_raw_df = pd.read_csv(
    "../datasets/flags/flag.data",
    sep=",",
    header=None,
    names=columns,
)
flags_raw_df

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colours,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,5,1,648,16,10,2,0,3,5,...,0,0,1,0,0,1,0,0,black,green
1,Albania,3,1,29,3,6,6,0,0,3,...,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,...,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,...,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,...,0,0,0,0,0,0,0,0,blue,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,Western-Samoa,6,3,3,0,1,1,0,0,3,...,0,1,5,0,0,0,0,0,blue,red
190,Yugoslavia,3,1,256,22,6,6,0,3,4,...,0,0,1,0,0,0,0,0,blue,red
191,Zaire,4,2,905,28,10,5,0,0,4,...,0,0,0,0,0,1,1,0,green,green
192,Zambia,4,2,753,6,10,5,3,0,4,...,0,0,0,0,0,0,1,0,green,brown


In [11]:
# Decode the integer language / religion codes into their text labels.
#
# Build a new frame instead of calling Series.replace(..., inplace=True) on a
# column selected from an alias of flags_raw_df. That chained in-place pattern
# mutates the raw frame on current pandas and has no effect at all once
# Copy-on-Write is enabled, which would leave the codes undecoded.
# Codes missing from the lookup dicts are left unchanged by replace.
named_df = flags_raw_df.assign(
    language=flags_raw_df["language"].replace(languages),
    religion=flags_raw_df["religion"].replace(religions),
)

In [12]:
# Categorical columns to expand into indicator columns. "religion" is left
# out because it is the prediction target.
to_one_hot: list = ["mainhue", "landmass", "zone", "language", "topleft", "botright"]

# One call encodes all listed columns: the untouched columns keep their
# original order and the indicator groups are appended in the order given in
# to_one_hot, each prefixed with its source column name.
one_hot_df = pd.get_dummies(data=named_df, columns=to_one_hot, prefix=to_one_hot)

In [13]:
# Numeric columns to rescale by their column maximum.
to_normalize: list = ["area", "population", "bars", "stripes", "colours", "circles", "crosses", "saltires", "quarters", "sunstars"]

# Keep an untouched snapshot of the pre-scaling values.
unscaled_df = one_hot_df.copy()

# Scale a separate copy rather than an alias of one_hot_df. With an alias the
# loop below rewrites one_hot_df itself, so re-running this cell would make
# unscaled_df a copy of already scaled data.
normalized_df = one_hot_df.copy()
for col in to_normalize:
    col_max = normalized_df[col].max()
    # An all-zero column would turn into NaN (0 / 0); leave it as zeros.
    if col_max == 0:
        continue
    normalized_df[col] = normalized_df[col].div(col_max)


In [14]:
# Seed NumPy's global generator. train_test_split below is called without an
# explicit random_state, so its shuffle draws from this global state.
np.random.seed(42)
random_state: int = 0  # NOTE(review): defined but not passed to any call in this cell

df = normalized_df

# Features are every column except the target; the target labels are encoded
# as integers.
X = df.drop(columns="religion")
religion_encoder = LabelEncoder()
y = religion_encoder.fit_transform(df["religion"])

# Hold out one third of the countries for testing.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

# Remember which country each row belongs to, then remove the identifier so
# it is not used as a feature.
train_countries = x_train["name"]
test_countries = x_test["name"]
x_train = x_train.drop(columns="name")
x_test = x_test.drop(columns="name")

In [15]:
# Show the distinct encoded class labels that occur in the training split.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [16]:
# Base forest; the grid below overrides n_estimators and max_features for
# every candidate that is fitted.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
)

# Candidate values searched exhaustively (27 combinations).
param_grid = {
    'n_estimators': [300, 200, 400],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validation, candidates ranked by micro-averaged F1.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
    scoring="f1_micro",
)
CV_rfc.fit(x_train, y_train)
CV_rfc.best_params_



{'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}

In [17]:
# Score the refitted best estimator on the held-out countries. The target has
# more than two classes, so an explicit averaging mode is passed to f1_score;
# macro averaging weights every class equally.
y_pred = CV_rfc.predict(x_test)
flags_test_f1 = f1_score(y_test, y_pred, average='macro')

print('F1-score:', flags_test_f1)
print('The best parameters are:', CV_rfc.best_params_)

F1-score: 0.5370413154779731
The best parameters are: {'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}
