In [5]:
import pandas as pd
from scipy import sparse
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 

In [6]:
# Load the two pre-computed sparse feature matrices (stored under the "text"
# and "title" file names) that make up the model input.
content = sparse.load_npz('csv/sparse_matrix_for_text.npz')
title = sparse.load_npz('csv/sparse_matrix_for_title.npz')

# Column-wise concatenation: hstack requires both inputs to have the same
# number of rows and raises otherwise.
collected_matrix = hstack([content, title])

# Persist the combined matrix so it can be loaded directly elsewhere.
sparse.save_npz("csv/sparse_matrix_for_all.npz", collected_matrix)

labels = pd.read_csv('csv/labels.csv')

# Reuse the matrix that is already in memory instead of reading the file that
# was written a moment ago back from disk (same data, no extra I/O pass).
features = collected_matrix

# Fail early with a clear message if labels and features are misaligned,
# rather than deep inside train_test_split in a later cell.
if labels.shape[0] != features.shape[0]:
    raise ValueError(
        f"labels has {labels.shape[0]} rows but features has {features.shape[0]} rows"
    )



- Changing to `LinearSVC` instead of SVM for faster training time.

In [7]:
# Model inputs: X is the combined sparse feature matrix, y is the label table
# flattened to a 1-D array (the shape scikit-learn estimators expect).
X = features
y = np.ravel(labels)
y

array([ 2,  7,  7, ..., 10, 10, 10])

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from imblearn.over_sampling import RandomOverSampler 

# Hold out 20% of the data as the test set, then split the remaining 80% in
# half, giving train / validation / test = 40% / 40% / 20% of the full data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

# Oversample the training split only; validation and test splits are left
# untouched so their scores reflect the original class distribution.
oversampler = RandomOverSampler(random_state=0)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

model = ComplementNB()
model.fit(X_train_oversampled, y_train_oversampled)

# Predict the validation split exactly once and reuse the result for every
# metric below (the original ran three full prediction passes over X_val).
y_val_pred = model.predict(X_val)
predictions_NB = y_val_pred  # alias kept so later cells using this name still work

# accuracy_score expects (y_true, y_pred); this is the same value
# model.score(X_val, y_val) would return.
val_accuracy = accuracy_score(y_val, y_val_pred)

print(model.score(X_train_oversampled, y_train_oversampled))
print(val_accuracy)

print("Naive Bayes Accuracy Score -> ", val_accuracy*100)

report = classification_report(y_val, y_val_pred)
print("classification report:")
print(report)

0.9959242010404491
0.8434984031772236
Naive Bayes Accuracy Score ->  84.34984031772235
classification report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.90     49640
           1       0.46      0.84      0.59      2638
           2       0.90      0.95      0.92     63216
           3       0.77      0.72      0.75      5781
           4       0.85      0.68      0.76     59590
           5       0.90      0.79      0.84     57876
           6       0.55      0.90      0.68      9241
           7       0.49      0.90      0.63     17394
           8       0.56      0.70      0.62      8626
           9       1.00      0.98      0.99     68565
          10       0.87      0.78      0.82    120227

    accuracy                           0.84    462794
   macro avg       0.75      0.83      0.77    462794
weighted avg       0.86      0.84      0.85    462794



In [9]:
# Disabled experiment: the whole cell is a bare string literal, so nothing in
# it executes; the cell only echoes the string as its output.
# The text inside is a manual sweep over the LogisticRegression regularisation
# parameter C, scored by validation accuracy.
# NOTE(review): it references `xv_train` / `xv_val`, which no visible cell
# defines -- re-enabling it as-is would raise NameError. Confirm the intended
# inputs (possibly X_train / X_val) before reviving it.
'''

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

# Define a range of C values to test
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Initialize the best_C and best_score variables
best_C = None
best_score = 0

# Iterate through the C_values and fit the model with each value
for C in C_values:
    # Create a Logistic Regression model with the current C value
    model = LogisticRegression(max_iter=1500, class_weight = 'balanced', random_state = 0, C=C)
    
    # Fit the model to the training data
    model.fit(xv_train, y_train)
    
    # Predict the validation data
    y_val_pred = model.predict(xv_val)
    
    # Calculate the accuracy score for the current C value
    score = accuracy_score(y_val, y_val_pred)
    
    # Print the current C value and its accuracy score
    print(f"C: {C}, Accuracy: {score}")
    
    # Update the best_C and best_score variables if the current score is higher than the previous best
    if score > best_score:
        best_C = C
        best_score = score

# Print the best C value and its accuracy score
print(f"Best C: {best_C}, Best Accuracy: {best_score}")

'''

'\n\nfrom sklearn.linear_model import LogisticRegression\n\nfrom sklearn.metrics import accuracy_score\n\n# Define a range of C values to test\nC_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]\n\n# Initialize the best_C and best_score variables\nbest_C = None\nbest_score = 0\n\n# Iterate through the C_values and fit the model with each value\nfor C in C_values:\n    # Create a Logistic Regression model with the current C value\n    model = LogisticRegression(max_iter=1500, class_weight = \'balanced\', random_state = 0, C=C)\n    \n    # Fit the model to the training data\n    model.fit(xv_train, y_train)\n    \n    # Predict the validation data\n    y_val_pred = model.predict(xv_val)\n    \n    # Calculate the accuracy score for the current C value\n    score = accuracy_score(y_val, y_val_pred)\n    \n    # Print the current C value and its accuracy score\n    print(f"C: {C}, Accuracy: {score}")\n    \n    # Update the best_C and best_score variables if the current score is higher than 

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

# Candidate smoothing strengths (alpha) for ComplementNB, one per decade.
param_grid = [0.00001,0.0001,0.001, 0.01,0.1,1,10,100,1000,10000]

# Best alpha seen so far and its validation accuracy.
best_alpha, best_score = None, 0

for pg in param_grid:
    # Train on the oversampled training split with the current alpha ...
    model = ComplementNB(alpha=pg).fit(X_train_oversampled, y_train_oversampled)

    # ... and score it on the untouched validation split.
    y_val_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_val_pred)

    print(f"alpha: {pg}, Accuracy: {score}")

    # Strict '>' keeps the first alpha that reaches the top score on ties.
    if score > best_score:
        best_alpha, best_score = pg, score

# Summary of the sweep: winning alpha and its validation accuracy.
print(f"\nBest alpha: {best_alpha}, Best Accuracy: {best_score}")
    

alpha: 1e-05, Accuracy: 0.8335285245703272
alpha: 0.0001, Accuracy: 0.8371910612497137
alpha: 0.001, Accuracy: 0.8420463532370774
alpha: 0.01, Accuracy: 0.8477141017385705
alpha: 0.1, Accuracy: 0.8516813096107555
alpha: 1, Accuracy: 0.8434984031772236
alpha: 10, Accuracy: 0.8146497145598257
alpha: 100, Accuracy: 0.7805546312182094
alpha: 1000, Accuracy: 0.7472266278300929
alpha: 10000, Accuracy: 0.6992268698384162

Best alpha: 0.1, Best Accuracy: 0.8516813096107555


In [None]:
# Disabled experiment: the whole cell is a bare string literal, so nothing in
# it executes.
# The text inside runs a 5-fold GridSearchCV over MultinomialNB's `alpha` on
# the oversampled training data, refits with the best alpha, and reports
# validation accuracy.
# NOTE(review): the folds are drawn from data that was already oversampled, so
# duplicated rows can presumably end up in both the fitting and the held-out
# part of a fold, which would make the CV scores optimistic -- confirm before
# relying on the selected alpha.
'''

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Define the parameter grid for the grid search
param_grid = {'alpha': [0.00001,0.0001,0.001, 0.01,0.1,1,10,100,1000,10000]}

# Create the MultinomialNB model
model = MultinomialNB()

# Create the grid search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search object to the oversampled training data
grid_search.fit(X_train_oversampled, y_train_oversampled)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
print("Best hyperparameters found:", best_params)

# Train the model with the best hyperparameters
best_model = MultinomialNB(**best_params)
best_model.fit(X_train_oversampled, y_train_oversampled)

# Evaluate the model
val_accuracy = best_model.score(X_val, y_val)
print("Validation accuracy with tuned hyperparameters:", val_accuracy)

'''

Best hyperparameters found: {'alpha': 0.001}
Validation accuracy with tuned hyperparameters: 0.81884596602376


In [None]:
# Disabled experiment: the whole cell is a bare string literal, so nothing in
# it executes.
# The text inside repeats the split / oversample / fit / report pipeline with
# MultinomialNB(**best_params) in place of ComplementNB.
# NOTE(review): `best_params` is only assigned inside another disabled
# (string-literal) cell in the visible notebook, so re-enabling this cell on
# its own would raise NameError.
"""

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from imblearn.over_sampling import RandomOverSampler 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

oversampler = RandomOverSampler(random_state=0)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

model = MultinomialNB(**best_params) 
model.fit(X_train_oversampled, y_train_oversampled)



print(model.score(X_train_oversampled, y_train_oversampled))
print(model.score(X_val, y_val))

predictions_NB = model.predict(X_val)

print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_val)*100)

y_val_pred = model.predict(X_val) 

report = classification_report(y_val, y_val_pred)
print("classification report:")
print(report)

"""

0.9995912088710703
0.7245102572634908
Naive Bayes Accuracy Score ->  72.45102572634909
classification report:
              precision    recall  f1-score   support

           0       0.74      0.44      0.55     49640
           1       0.93      0.28      0.43      2638
           2       0.90      0.86      0.88     63216
           3       0.96      0.40      0.56      5781
           4       0.80      0.52      0.63     59590
           5       0.82      0.61      0.70     57876
           6       0.89      0.54      0.67      9241
           7       0.80      0.17      0.28     17394
           8       0.89      0.26      0.40      8626
           9       0.92      0.97      0.95     68565
          10       0.55      0.93      0.69    120227

    accuracy                           0.72    462794
   macro avg       0.84      0.54      0.61    462794
weighted avg       0.77      0.72      0.71    462794

