In [82]:
import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including scikit-learn's convergence and failed-fit warnings. Consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# One seed for the estimators that draw random numbers while fitting, so a
# fresh Restart & Run All produces the same models.
RANDOM_STATE = 42

# Classifier instances gathered in a single tuple.
algos = (
    LogisticRegression(max_iter=1000),
    MultinomialNB(),
    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    KNeighborsClassifier(),
)

In [36]:
# The CSV stores the saved row index as its first, unnamed column. Reading it
# as the index keeps it out of the feature matrix; otherwise it appears as an
# "Unnamed: 0" feature and leaks row order into any model fit on X.
X = pd.read_csv("Training Data/Vectorized_X.csv", index_col=0)
X.head()

Unnamed: 0,10,100,1000,11,12,13,14,15,16,17,...,yo,youll,young,youtu,youtub,youtubecom,youv,zero,zombi,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Read the label CSV into a DataFrame; it is label-encoded in a later cell.
y = pd.read_csv(filepath_or_buffer="Training Data/Vectorized_y.csv")

In [78]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder expects 1-D input, but `y` arrives from read_csv as a
# DataFrame. Squeeze the single column down to a Series first instead of
# relying on an implicit (warning-emitting) conversion. Squeezing is a no-op
# on a Series, so the cell can be re-run after `y` has been rebound below.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y.squeeze())

# Integer class codes on a fresh default (0..n-1) index.
y = pd.Series(encoded_y)
y.head()

0    3
1    3
2    3
3    3
4    3
dtype: int32

In [75]:
class SampleIndices:
    """Select a leading fraction of index labels from a label Series.

    Selection is deterministic: rows are taken in their existing order, with
    no shuffling. Both public methods return index labels of ``y``, not
    the label values themselves.
    """

    def __init__(self, y):
        # pandas Series of class labels; its index supplies the returned values.
        self.y = y

    @staticmethod
    def _checkRate(sample_rate: float) -> None:
        """Reject fractions outside [0, 1], which would slice unexpectedly."""
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"sample_rate must be between 0.0 and 1.0, got {sample_rate!r}"
            )

    def stratifiedIndices(self, sample_rate: float) -> list:
        """Return the first ``sample_rate`` fraction of rows of every class.

        Classes are visited in order of first appearance in ``y``. For each
        class the number of rows kept is ``int(sample_rate * class_size)``
        (truncated), so very small classes may contribute no rows.

        Args:
            sample_rate: Fraction of each class to keep, in [0.0, 1.0].

        Returns:
            Flat list of index labels, grouped by class.

        Raises:
            ValueError: If ``sample_rate`` is outside [0.0, 1.0].
        """
        self._checkRate(sample_rate)
        label_indices = []
        for label in self.y.unique():
            # Build the class mask once and reuse it for both size and slice.
            class_rows = self.y.index[(self.y == label).to_numpy()]
            keep = int(sample_rate * len(class_rows))
            label_indices.extend(class_rows[:keep].tolist())
        return label_indices

    def getSamples(self, sample_rate: float = 0.7, stratify: bool = True) -> list:
        """Return index labels for a leading sample of ``y``.

        Args:
            sample_rate: Fraction of rows to keep, in [0.0, 1.0]. Default 0.7.
            stratify: If True (default), keep that fraction of every class;
                otherwise keep the first fraction of the whole Series.

        Returns:
            List of index labels of the selected rows.

        Raises:
            ValueError: If ``sample_rate`` is outside [0.0, 1.0].
        """
        if stratify:
            return self.stratifiedIndices(sample_rate)
        self._checkRate(sample_rate)
        return self.y.index.tolist()[:int(sample_rate * len(self.y))]
        

# Take a stratified 40% sample of row indices and report how many were kept.
sampler = SampleIndices(y)
indices = sampler.getSamples(stratify=True, sample_rate=0.4)
print(len(indices))

29996


In [80]:
# `indices` were drawn from y's default 0..n-1 index, so they are valid row
# positions for both objects. Slice the labels together with the features so
# the sample keeps a row-aligned target for any later fit.
X_samples = X.iloc[indices]
y_samples = y.iloc[indices]

In [94]:
from abc import ABC, abstractmethod
class HyperParameterTunerAbstract(ABC):
    """Interface for hyper-parameter tuners bound to one (X, y) dataset."""

    def __init__(self, X, y):
        # Data handed to every search run by the tuner.
        self.X = X
        self.y = y

    @abstractmethod
    def gridSearch(self, algo: ClassifierMixin, param_grid: dict):
        """Run a search over ``param_grid`` for the estimator ``algo``."""

    @abstractmethod
    def getBestParams(self, algo: ClassifierMixin, param_grid: dict):
        """Search ``param_grid`` for ``algo`` and return the outcome.

        The signature takes ``algo`` as well as ``param_grid`` so that it
        matches how implementations and callers invoke it.
        """

class HyperParameterTuner(HyperParameterTunerAbstract):
    """Hyper-parameter tuner that runs scikit-learn's GridSearchCV on (X, y)."""

    def __init__(self, X, y):
        # Delegate to the base class rather than duplicating its assignments.
        super().__init__(X, y)

    def gridSearch(self, algo: ClassifierMixin, param_grid: dict, **search_kwargs):
        """Fit a GridSearchCV for ``algo`` over ``param_grid``.

        Args:
            algo: Estimator instance to tune.
            param_grid: Parameter grid passed to GridSearchCV.
            **search_kwargs: Extra keyword arguments forwarded unchanged to
                GridSearchCV (for example ``cv``, ``scoring``, ``n_jobs``).

        Returns:
            The fitted GridSearchCV object.
        """
        grid_search = GridSearchCV(
            estimator=algo, param_grid=param_grid, **search_kwargs
        )
        grid_search.fit(self.X, self.y)
        return grid_search

    def getBestParams(self, algo: ClassifierMixin, param_grid: dict, **search_kwargs):
        """Run the grid search and return the fitted GridSearchCV object.

        Despite the name, this returns the whole fitted search, not only the
        parameter dict; existing callers rely on that. Read ``.best_params_``
        on the result to get the winning parameters.
        """
        return self.gridSearch(algo, param_grid, **search_kwargs)

# NOTE(review): the tuner is built on the full X / y even though a 40% sample
# (`X_samples`) is prepared in an earlier cell and not used here. Confirm
# which dataset the search is meant to run on.
tuner = HyperParameterTuner(X, y)

In [None]:
lr_param_grid = {
    'C': [0.1, 1],
    'penalty': ['l1']
}
# LogisticRegression's default 'lbfgs' solver does not support the L1 penalty,
# so every candidate in this grid would fail to fit. 'saga' supports L1.
# max_iter matches the LogisticRegression entry in `algos`; random_state pins
# saga's data shuffling so reruns are repeatable.
lr_grid_search = tuner.getBestParams(
    LogisticRegression(solver='saga', max_iter=1000, random_state=42),
    param_grid=lr_param_grid,
)
lr_grid_search