In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import calibration_curve

In [None]:
# Load the tweet dataset from the CSV file in the working directory.
dataset = pd.read_csv('split_1.csv')
# Preview only the first rows instead of rendering the whole frame as output.
dataset.head()

In [None]:
# Visualise missing values: after the transpose, each heatmap row is one
# column of `dataset` and each heatmap column is one record.
fig, ax = plt.subplots(figsize=(15, 6))
null_mask = dataset.isnull().transpose()
sns.heatmap(null_mask, cbar_kws={'label': 'Missing Data'}, ax=ax)
# Author's observation from the rendered plot: no missing values are visible.

In [None]:
# Print the index labels of rows whose 'tweets' value is null.
tweet_is_null = dataset['tweets'].isnull()
print(dataset.loc[tweet_is_null].index)

In [None]:
# Discard every row that contains at least one missing value, rebinding the
# name to the cleaned frame rather than mutating the frame in place.
dataset = dataset.dropna()

In [None]:
# Sanity check: collect the index labels of rows that still hold a NaN in any
# column (expected to be empty once rows with missing values were dropped).
has_nan = dataset.isna().any(axis=1)
nan_indices = dataset.loc[has_nan].index
print(nan_indices)

In [None]:
# Feature column ('tweets') and target column ('is_anxious').
X, y = dataset['tweets'], dataset['is_anxious']
X

In [None]:
# Hold out 5% of the rows for testing; the fixed seed keeps the split
# reproducible.
# The target is passed as a 1-D array: scikit-learn classifiers expect y of
# shape (n_samples,), and a column vector of shape (n_samples, 1) triggers a
# DataConversionWarning on every fit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values, test_size=0.05, random_state=4
)
len(X_train), len(X_test)

In [None]:
# Imported here because this cell is the only user; the notebook already
# keeps model-specific imports next to the cell that needs them.
from sklearn.pipeline import Pipeline

# Candidate vocabulary sizes for the TF-IDF vectorizer.
max_features_values = [10000, 11000]

tfidf_vectorizer = TfidfVectorizer()

# A bare TfidfVectorizer is a transformer with no predict(), so it cannot be
# scored with 'accuracy'. The vectorizer is therefore tuned inside a pipeline
# that ends in a classifier, so each candidate is judged by the
# cross-validated accuracy of the downstream model.
tfidf_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', MultinomialNB()),
])

# The 'tfidf__' prefix routes the parameter to the pipeline's 'tfidf' step.
param_grid = {
    'tfidf__max_features': max_features_values,
}

grid_search = GridSearchCV(tfidf_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# np.ravel accepts both a 1-D target and an (n, 1) column vector.
grid_search.fit(X_train, np.ravel(y_train))

best_max_features = grid_search.best_params_['tfidf__max_features']

print(f"Best max_features: {best_max_features}")

# With the default refit=True, best_estimator_ is the winning pipeline refit
# on all of X_train; keep only its fitted vectorizer step for later cells.
best_tfidf_vectorizer = grid_search.best_estimator_.named_steps['tfidf']

In [None]:
# Persist the selected vectorizer. The context manager closes the file handle
# even if pickling raises, instead of leaving it open until garbage collection.
with open('grid_search_50000_tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(best_tfidf_vectorizer, vectorizer_file)

In [None]:
# Reload the pickled vectorizer; the context manager guarantees the file
# handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('grid_search_50000_tfidf.pkl', 'rb') as vectorizer_file:
    best_tfidf_vectorizer = pickle.load(vectorizer_file)

In [None]:
# Project the train and test tweets through the selected TF-IDF vectorizer.
X_train_tweets_tfidf, X_test_tweets_tfidf = (
    best_tfidf_vectorizer.transform(tweets) for tweets in (X_train, X_test)
)

# The first dimension of each matrix is its number of samples.
num_train_samples = X_train_tweets_tfidf.shape[0]
num_test_samples = X_test_tweets_tfidf.shape[0]

print(f"Number of training samples: {num_train_samples}")
print(f"Number of test samples: {num_test_samples}")

In [None]:
# Densify both sparse TF-IDF matrices into NumPy arrays (train first, then test).
X_train_tweets_dense, X_test_tweets_dense = (
    sparse_matrix.toarray()
    for sparse_matrix in (X_train_tweets_tfidf, X_test_tweets_tfidf)
)

In [None]:
# Three Naive Bayes variants combined into one soft-voting ensemble.
nb, nb2, nb3 = GaussianNB(), BernoulliNB(), MultinomialNB()
base_estimators = [
    ('GaussianNB', nb),
    ('BernoulliNB', nb2),
    ('MultinomialNB', nb3),
]
VotingClassifiers = VotingClassifier(estimators=base_estimators, voting='soft')

# Train the ensemble on the dense TF-IDF features.
VotingClassifiers.fit(X_train_tweets_dense, y_train)

In [None]:
# Report the voting ensemble's score on the training split, then the test split.
for split_label, split_X, split_y in (
    ('Training score:', X_train_tweets_dense, y_train),
    ('Testing score:', X_test_tweets_dense, y_test),
):
    print(split_label, VotingClassifiers.score(split_X, split_y))

In [None]:
import pickle

# Persist the trained ensemble. The context manager closes the file handle
# even if pickling raises, instead of leaving it open until garbage collection.
with open('Voting_classifier.pkl', 'wb') as model_file:
    pickle.dump(VotingClassifiers, model_file)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (5 trees) with a fixed seed, trained on the dense
# TF-IDF features.
rf_params = {'n_estimators': 5, 'random_state': 42}
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train_tweets_dense, y_train)

In [None]:
# Report the random forest's score on the training split, then the test split.
for split_label, split_X, split_y in (
    ('Training score:', X_train_tweets_dense, y_train),
    ('Testing score:', X_test_tweets_dense, y_test),
):
    print(split_label, rf_classifier.score(split_X, split_y))

In [None]:
import xgboost as xgb

# Small boosted ensemble (5 rounds) with a fixed seed.
xgb_classifier = xgb.XGBClassifier(n_estimators=5, random_state=42)

# Collapse the labels to {0, 1}: 0 stays 0, every other value becomes 1.
# The same mapping is applied to BOTH splits; remapping only y_train would
# leave y_test in the original label space, so test scores for this and later
# models would be computed against mismatched labels. The mapping is
# idempotent, so re-running the cell is safe.
# NOTE(review): presumably needed because XGBClassifier expects class ids
# starting at 0 -- confirm against the actual label values in 'is_anxious'.
y_train = np.where(y_train == 0, 0, 1)
y_test = np.where(y_test == 0, 0, 1)
xgb_classifier.fit(X_train_tweets_dense, y_train)

In [None]:
# Report the XGBoost model's score on the training split, then the test split.
for split_label, split_X, split_y in (
    ('Training score:', X_train_tweets_dense, y_train),
    ('Testing score:', X_test_tweets_dense, y_test),
):
    print(split_label, xgb_classifier.score(split_X, split_y))

In [None]:
# HistGradientBoostingClassifier is stable since scikit-learn 1.0, where the
# experimental enabling import is deprecated. Try the stable import first and
# fall back to the experimental flag only on older releases.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

# Seeded like the other models in this notebook so repeated runs are
# reproducible (the seed matters when early stopping holds out a validation
# split).
clf = HistGradientBoostingClassifier(random_state=42)
clf.fit(X_train_tweets_dense, y_train)

In [None]:
# Report the gradient-boosting model's score on the training split, then the
# test split.
for split_label, split_X, split_y in (
    ('Training score:', X_train_tweets_dense, y_train),
    ('Testing score:', X_test_tweets_dense, y_test),
):
    print(split_label, clf.score(split_X, split_y))