In [64]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [65]:
def add_pca_features(df, n_components=37):
    """Standardise the feature columns of ``df`` and project them with PCA.

    Every column except the metadata columns listed in ``non_feature_columns``
    is treated as a feature, scaled with ``StandardScaler`` and passed to
    ``PCA``. The cumulative explained variance of the kept components is
    printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all of the metadata columns listed below (``drop``
        raises ``KeyError`` otherwise). The remaining columns are presumably
        numeric embedding dimensions -- verify against the caller.
    n_components : int, float, str or None, default 37
        Forwarded unchanged to ``sklearn.decomposition.PCA``. Output column
        names are derived from the number of components PCA actually
        returns, so non-integer settings (e.g. a variance fraction) work too.

    Returns
    -------
    finalDf : pandas.DataFrame
        Columns ``pc_1 .. pc_k`` followed by ``genre``, on a fresh 0..n-1 index.
    pca : sklearn.decomposition.PCA
        The fitted PCA object.
    """
    non_feature_columns = [
        "label",
        "title",
        "genre",
        "description",
        "description_t",
        "first_token",
        "last_token",
        "embedding",
    ]
    features = df.drop(columns=non_feature_columns)

    # Scale before PCA so that no single column dominates the variance.
    scaled_features = StandardScaler().fit_transform(features)

    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(scaled_features)

    print(
        "Cumulative explained variance:", np.cumsum(pca.explained_variance_ratio_)[-1]
    )

    # Name columns after what PCA produced rather than after the requested
    # value, which is not necessarily an integer.
    n_kept = principalComponents.shape[1]
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=[f"pc_{i}" for i in range(1, n_kept + 1)],
    )

    # principalDf has a fresh RangeIndex; reset the labels so concat aligns
    # rows by position instead of by df's original index.
    labels = df["genre"].reset_index(drop=True)
    finalDf = pd.concat([principalDf, labels], axis=1)
    return finalDf, pca

In [66]:
def create_clustered_df(pca_df, n_clusters=8):
    """Cluster the rows of ``pca_df`` with k-means and attach the cluster ids.

    Parameters
    ----------
    pca_df : pandas.DataFrame
        Must contain a ``genre`` column, which is excluded from clustering.
        All other columns (except a pre-existing ``cluster`` column) are
        used as clustering features.
    n_clusters : int, default 8
        Number of k-means clusters.

    Returns
    -------
    pandas.DataFrame
        The same ``pca_df`` object, modified in place with a ``cluster``
        column holding the fitted k-means labels.
    """
    pca_mat = pca_df.drop(columns=["genre"])
    # This function writes "cluster" into pca_df, so a second call on the
    # same frame would otherwise cluster on the previous labels as a feature.
    pca_mat = pca_mat.drop(columns=["cluster"], errors="ignore")

    kmeans = KMeans(
        n_clusters=n_clusters, init="random", max_iter=300, n_init=10, random_state=29
    )
    kmeans.fit(pca_mat)

    # Kept as an in-place assignment: callers may rely on pca_df being updated.
    pca_df["cluster"] = kmeans.labels_

    return pca_df

In [37]:
random_state = 42

# Separate the target column from the feature matrix.
y = pca_df["genre"]
X_original = pca_df.drop(columns=["genre"])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_original,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Give the target splits a fresh 0..n-1 index.
# NOTE(review): X_train / X_test keep their original (shuffled) index, so X and
# y are aligned by position only -- confirm nothing downstream joins them by index.
y_train, y_test = (
    target.reset_index(drop=True) for target in (y_train, y_test)
)

In [74]:
# Grid search for AdaBoost: every (n_estimators, learning_rate) pair is fitted
# on the training split and scored by accuracy on the test split.
# NOTE(review): the ranking uses the test split, so the best score is
# optimistically biased -- consider a validation split or cross-validation.
n_e = [100, 200, 500]
l_r = [0.05, 0.1, 0.2]

hyperparams = []
accuracies = []

for n_estimators in n_e:
    for learning_rate in l_r:
        hyperparams.append((n_estimators, learning_rate))

        model = AdaBoostClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate, random_state=0
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

# Rank once, after the whole grid has been evaluated. Accuracies are rounded
# before sorting, and the sort is stable, so ties keep their grid order.
best_param = sorted(
    ((params, round(accuracy, 3)) for params, accuracy in zip(hyperparams, accuracies)),
    key=lambda pair: pair[1],
    reverse=True,
)

for params, accuracy in best_param:
    print('Hyperparameters: {} \t\t Accuracy: {}'.format(params, accuracy))


Hyperparameters: (500, 0.2) 		 Accuracy: 0.59
Hyperparameters: (500, 0.1) 		 Accuracy: 0.579
Hyperparameters: (200, 0.2) 		 Accuracy: 0.574
Hyperparameters: (500, 0.05) 		 Accuracy: 0.557
Hyperparameters: (100, 0.2) 		 Accuracy: 0.549
Hyperparameters: (200, 0.1) 		 Accuracy: 0.549
Hyperparameters: (100, 0.1) 		 Accuracy: 0.523
Hyperparameters: (200, 0.05) 		 Accuracy: 0.523
Hyperparameters: (100, 0.05) 		 Accuracy: 0.493


In [61]:
# Grid search for RandomForest: every (n_estimators, max_depth) pair is fitted
# on the training split and scored by accuracy on the test split.
# NOTE(review): the ranking uses the test split, so the best score is
# optimistically biased -- consider a validation split or cross-validation.
n_e = [100, 200]
m_d = [10, 20]

hyperparams = []
accuracies = []

for n_estimators in n_e:
    for max_depth in m_d:
        hyperparams.append((n_estimators, max_depth))

        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=0
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

# Rank once, after the whole grid has been evaluated. Accuracies are rounded
# before sorting, and the sort is stable, so ties keep their grid order.
best_param = sorted(
    ((params, round(accuracy, 3)) for params, accuracy in zip(hyperparams, accuracies)),
    key=lambda pair: pair[1],
    reverse=True,
)

for params, accuracy in best_param:
    print('Hyperparameters: {} \t\t Accuracy: {}'.format(params, accuracy))

0.3564134844034131
0.3564134844034131
[((100, 20), 0.601), ((100, 10), 0.566)]
0.3564134844034131
0.3564134844034131
[((200, 20), 0.602), ((100, 20), 0.601), ((100, 10), 0.566), ((200, 10), 0.566)]
Hyperparameters: (200, 20) 		 CV_Score: 0.602
Hyperparameters: (100, 20) 		 CV_Score: 0.601
Hyperparameters: (100, 10) 		 CV_Score: 0.566
Hyperparameters: (200, 10) 		 CV_Score: 0.566
1
