**Combining passive aggressive classifier and weighted knn**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess training data for Passive Aggressive Classifier
df_train_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_pac.rename(columns={'2401': 'id', 'Positive': 'Sentiment', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_pac.dropna(inplace=True)
X_train_pac = df_train_pac['text']
y_train_pac = df_train_pac['Sentiment']
tfidf_vectorizer_pac = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_pac = tfidf_vectorizer_pac.fit_transform(X_train_pac)
pac_model = PassiveAggressiveClassifier(max_iter=50)
pac_model.fit(tfidf_train_pac, y_train_pac)

# Load and preprocess test data for Passive Aggressive Classifier
df_test_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_pac.rename(columns={'3364': 'id', 'Irrelevant': 'Sentiment', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_pac.dropna(inplace=True)
X_test_pac = df_test_pac['text']
y_test_pac = df_test_pac['Sentiment']
tfidf_test_pac = tfidf_vectorizer_pac.transform(X_test_pac)

# Load and preprocess training data for Weighted KNN
df_train_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_knn.rename(columns={'2401': 'id', 'Positive': 'Sentiment', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_knn.dropna(inplace=True)
X_train_knn = df_train_knn['text']
y_train_knn = df_train_knn['Sentiment']
tfidf_vectorizer_knn = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_knn = tfidf_vectorizer_knn.fit_transform(X_train_knn)
knn_model = KNeighborsClassifier(n_neighbors=1, weights='distance')
knn_model.fit(tfidf_train_knn, y_train_knn)

# Load and preprocess test data for Weighted KNN
df_test_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_knn.rename(columns={'3364': 'id', 'Irrelevant': 'Sentiment', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_knn.dropna(inplace=True)
X_test_knn = df_test_knn['text']
y_test_knn = df_test_knn['Sentiment']
tfidf_test_knn = tfidf_vectorizer_knn.transform(X_test_knn)

# Initialize the Voting Classifier with the two models
voting_model = VotingClassifier([('pac', pac_model), ('knn', knn_model)], voting='hard')

# Fit the Voting Classifier
voting_model.fit(tfidf_test_pac, y_test_pac)

# Predict with the Voting Classifier
y_pred_ensemble = voting_model.predict(tfidf_test_pac)

# Evaluate ensemble model
accuracy_ensemble = accuracy_score(y_test_pac, y_pred_ensemble)
print(f'Ensemble Accuracy: {round(accuracy_ensemble * 100, 2)}%')
print('Ensemble Classification Report:')
print(classification_report(y_test_pac, y_pred_ensemble))


Ensemble Accuracy: 99.9%
Ensemble Classification Report:
              precision    recall  f1-score   support

  Irrelevant       1.00      0.99      1.00       171
    Negative       1.00      1.00      1.00       266
     Neutral       1.00      1.00      1.00       285
    Positive       1.00      1.00      1.00       277

    accuracy                           1.00       999
   macro avg       1.00      1.00      1.00       999
weighted avg       1.00      1.00      1.00       999



**combining pac and xg boost**

In [None]:
pip install xgboost



**passive and random forest using voting ensemble model**

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load and preprocess training data for Passive Aggressive Classifier
df_train_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_pac.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_pac.dropna(inplace=True)
X_train_pac = df_train_pac['text']
y_train_pac = df_train_pac['Label']
tfidf_vectorizer_pac = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_pac = tfidf_vectorizer_pac.fit_transform(X_train_pac)
pac_model = PassiveAggressiveClassifier(max_iter=50)
pac_model.fit(tfidf_train_pac, y_train_pac)

# Load and preprocess test data for Passive Aggressive Classifier
df_test_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_pac.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_pac.dropna(inplace=True)
X_test_pac = df_test_pac['text']
y_test_pac = df_test_pac['Label']
tfidf_test_pac = tfidf_vectorizer_pac.transform(X_test_pac)
y_pred_pac = pac_model.predict(tfidf_test_pac)

# Load and preprocess training data for Random Forest Classifier
df_train_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_rfc.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_rfc.dropna(inplace=True)
X_train_rfc = df_train_rfc['text']
y_train_rfc = df_train_rfc['Label']
tfidf_vectorizer_rfc = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_rfc = tfidf_vectorizer_rfc.fit_transform(X_train_rfc)
rfc_model = RandomForestClassifier()
rfc_model.fit(tfidf_train_rfc, y_train_rfc)

# Load and preprocess test data for Random Forest Classifier
df_test_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_rfc.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_rfc.dropna(inplace=True)
X_test_rfc = df_test_rfc['text']
y_test_rfc = df_test_rfc['Label']
tfidf_test_rfc = tfidf_vectorizer_rfc.transform(X_test_rfc)
y_pred_rfc = rfc_model.predict(tfidf_test_rfc)

# Create a voting ensemble model
voting_model = VotingClassifier([('pac', pac_model), ('rfc', rfc_model)], voting='hard')

# Fit the voting ensemble model
voting_model.fit(tfidf_test_pac, y_test_pac)

# Predict with the voting ensemble model
y_pred_ensemble = voting_model.predict(tfidf_test_pac)

# Evaluate ensemble model
accuracy_ensemble = accuracy_score(y_test_pac, y_pred_ensemble)
print(f'Ensemble Accuracy: {round(accuracy_ensemble * 100, 2)}%')
print('Ensemble Classification Report:')
print(classification_report(y_test_pac, y_pred_ensemble))


Ensemble Accuracy: 99.9%
Ensemble Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.99      1.00      1.00       171
    Negative       1.00      1.00      1.00       266
     Neutral       1.00      1.00      1.00       285
    Positive       1.00      1.00      1.00       277

    accuracy                           1.00       999
   macro avg       1.00      1.00      1.00       999
weighted avg       1.00      1.00      1.00       999



**using pac and random forest with stacking ensemble model**


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

# Load and preprocess training data
df_train = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train.dropna(inplace=True)

X_train = df_train['text']
y_train = df_train['Label']

# Load and preprocess test data
df_test = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test.dropna(inplace=True)

X_test = df_test['text']
y_test = df_test['Label']

# Define base models
pac_model = make_pipeline(TfidfVectorizer(stop_words='english', max_df=0.7), PassiveAggressiveClassifier(max_iter=50))
rfc_model = make_pipeline(TfidfVectorizer(stop_words='english', max_df=0.7), RandomForestClassifier())

# Define meta-model
meta_model = RandomForestClassifier()

# Define stacking ensemble model
stacking_model = StackingClassifier([('pac', pac_model), ('rfc', rfc_model)], final_estimator=meta_model)

# Fit the stacking ensemble model
stacking_model.fit(X_train, y_train)

# Evaluate ensemble model
y_pred_stacking = stacking_model.predict(X_test)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
print(f'Stacking Ensemble Accuracy: {round(accuracy_stacking * 100, 2)}%')
print('Stacking Ensemble Classification Report:')
print(classification_report(y_test, y_pred_stacking))


Stacking Ensemble Accuracy: 81.28%
Stacking Ensemble Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.88      0.64      0.74       171
    Negative       0.74      0.78      0.76       266
     Neutral       0.81      0.87      0.84       285
    Positive       0.87      0.89      0.88       277

    accuracy                           0.81       999
   macro avg       0.82      0.80      0.80       999
weighted avg       0.82      0.81      0.81       999



**voting ensemble model of weighted knn and random forest**

In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess training data
df_train = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train.rename(columns={'2401': 'id',
                         'Positive': 'Sentiment',
                         'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train.dropna(inplace=True)

# Load and preprocess test data
df_test = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test.rename(columns={'3364': 'id',
                        'Irrelevant': 'Sentiment',
                        'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test.dropna(inplace=True)

# Splitting data
X_train = df_train['text']
y_train = df_train['Sentiment']
X_test = df_test['text']
y_test = df_test['Sentiment']

# Vectorize text data
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# Initialize classifiers
knn_model = KNeighborsClassifier(n_neighbors=1, weights='distance')
rfc_model = RandomForestClassifier()

# Fit classifiers
knn_model.fit(tfidf_train, y_train)
rfc_model.fit(tfidf_train, y_train)

# Predict using the classifiers
y_pred_knn = knn_model.predict(tfidf_test)
y_pred_rfc = rfc_model.predict(tfidf_test)

# Create a voting ensemble classifier
voting_clf = VotingClassifier(estimators=[('knn', knn_model), ('rfc', rfc_model)], voting='hard')

# Fit the voting ensemble classifier
voting_clf.fit(tfidf_train, y_train)

# Predict using the voting ensemble classifier
y_pred_voting = voting_clf.predict(tfidf_test)

# Calculate accuracy for each classifier and the ensemble
accuracy_knn = accuracy_score(y_test, y_pred_knn)
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)
accuracy_voting = accuracy_score(y_test, y_pred_voting)

# Display results
print("Accuracy for KNN:", round(accuracy_knn * 100, 2), "%")
print("Accuracy for RFC:", round(accuracy_rfc * 100, 2), "%")
print("Accuracy for Voting Ensemble:", round(accuracy_voting * 100, 2), "%")

# Display classification report for the voting ensemble
print('Classification Report for Voting Ensemble:')
print(classification_report(y_test, y_pred_voting))


Accuracy for KNN: 96.8 %
Accuracy for RFC: 96.0 %
Accuracy for Voting Ensemble: 97.2 %
Classification Report for Voting Ensemble:
              precision    recall  f1-score   support

  Irrelevant       0.97      0.99      0.98       171
    Negative       0.95      0.98      0.97       266
     Neutral       0.99      0.96      0.98       285
    Positive       0.99      0.95      0.97       277

    accuracy                           0.97       999
   macro avg       0.97      0.97      0.97       999
weighted avg       0.97      0.97      0.97       999



**voting ensemble model of knn randomforest and pac**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import spacy

# Load and preprocess training data for PAC
df_train_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_pac.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_pac.dropna(inplace=True)
X_train_pac = df_train_pac['text']
y_train_pac = df_train_pac['Label']
tfidf_vectorizer_pac = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_pac = tfidf_vectorizer_pac.fit_transform(X_train_pac)
pac_model = PassiveAggressiveClassifier(max_iter=50)
pac_model.fit(tfidf_train_pac, y_train_pac)

# Load and preprocess test data for PAC
df_test_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_pac.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_pac.dropna(inplace=True)
X_test_pac = df_test_pac['text']
y_test_pac = df_test_pac['Label']
tfidf_test_pac = tfidf_vectorizer_pac.transform(X_test_pac)

# Load and preprocess training data for Weighted KNN
df_train_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_knn.rename(columns={'2401': 'id', 'Positive': 'Sentiment', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_knn.dropna(inplace=True)
X_train_knn = df_train_knn['text']
y_train_knn = df_train_knn['Sentiment']
tfidf_vectorizer_knn = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_knn = tfidf_vectorizer_knn.fit_transform(X_train_knn)
knn_model = KNeighborsClassifier(n_neighbors=1, weights='distance')
knn_model.fit(tfidf_train_knn, y_train_knn)

# Load and preprocess test data for Weighted KNN
df_test_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_knn.rename(columns={'3364': 'id', 'Irrelevant': 'Sentiment', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_knn.dropna(inplace=True)
X_test_knn = df_test_knn['text']
y_test_knn = df_test_knn['Sentiment']
tfidf_test_knn = tfidf_vectorizer_knn.transform(X_test_knn)

# Load and preprocess training data for Random Forest Classifier
df_train_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_rfc.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_rfc.dropna(inplace=True)
X_train_rfc = df_train_rfc['text']
y_train_rfc = df_train_rfc['Label']
tfidf_vectorizer_rfc = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_rfc = tfidf_vectorizer_rfc.fit_transform(X_train_rfc)
rfc_model = RandomForestClassifier()
rfc_model.fit(tfidf_train_rfc, y_train_rfc)

# Load and preprocess test data for Random Forest Classifier
df_test_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_rfc.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_rfc.dropna(inplace=True)
X_test_rfc = df_test_rfc['text']
y_test_rfc = df_test_rfc['Label']
tfidf_test_rfc = tfidf_vectorizer_rfc.transform(X_test_rfc)

# Create a Voting Classifier
voting_model = VotingClassifier([('pac', pac_model), ('knn', knn_model), ('rfc', rfc_model)], voting='hard')

# Fit the Voting Classifier
X_train_ensemble = pd.concat([X_train_pac, X_train_knn, X_train_rfc])
y_train_ensemble = pd.concat([y_train_pac, y_train_knn, y_train_rfc])
tfidf_train_ensemble = tfidf_vectorizer_pac.transform(X_train_ensemble)
voting_model.fit(tfidf_train_ensemble, y_train_ensemble)

# Predict with the Voting Classifier
X_test_ensemble = pd.concat([X_test_pac, X_test_knn, X_test_rfc])
tfidf_test_ensemble = tfidf_vectorizer_pac.transform(X_test_ensemble)
y_pred_ensemble = voting_model.predict(tfidf_test_ensemble)

# Evaluate ensemble model
accuracy_ensemble = accuracy_score(pd.concat([y_test_pac, y_test_knn, y_test_rfc]), y_pred_ensemble)
print(f'Ensemble Accuracy: {round(accuracy_ensemble * 100, 2)}%')
print('Ensemble Classification Report:')
print(classification_report(pd.concat([y_test_pac, y_test_knn, y_test_rfc]), y_pred_ensemble))


Ensemble Accuracy: 98.4%
Ensemble Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.99      0.99      0.99       513
    Negative       0.97      0.98      0.98       798
     Neutral       1.00      0.98      0.99       855
    Positive       0.98      0.99      0.99       831

    accuracy                           0.98      2997
   macro avg       0.98      0.98      0.98      2997
weighted avg       0.98      0.98      0.98      2997



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess training data for Passive Aggressive Classifier
df_train_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_pac.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_pac.dropna(inplace=True)
X_train_pac = df_train_pac['text']
y_train_pac = df_train_pac['Label']
tfidf_vectorizer_pac = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_pac = tfidf_vectorizer_pac.fit_transform(X_train_pac)
pac_model = PassiveAggressiveClassifier(max_iter=50)
pac_model.fit(tfidf_train_pac, y_train_pac)

# Load and preprocess test data for Passive Aggressive Classifier
df_test_pac = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_pac.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_pac.dropna(inplace=True)
X_test_pac = df_test_pac['text']
y_test_pac = df_test_pac['Label']
tfidf_test_pac = tfidf_vectorizer_pac.transform(X_test_pac)

# Load and preprocess training data for Weighted k-NN
df_train_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_knn.rename(columns={'2401': 'id', 'Positive': 'Sentiment', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_knn.dropna(inplace=True)
X_train_knn = df_train_knn['text']
y_train_knn = df_train_knn['Sentiment']
tfidf_vectorizer_knn = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_knn = tfidf_vectorizer_knn.fit_transform(X_train_knn)
knn_model = KNeighborsClassifier(n_neighbors=1, weights='distance')
knn_model.fit(tfidf_train_knn, y_train_knn)

# Load and preprocess test data for Weighted k-NN
df_test_knn = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_knn.rename(columns={'3364': 'id', 'Irrelevant': 'Sentiment', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_knn.dropna(inplace=True)
X_test_knn = df_test_knn['text']
y_test_knn = df_test_knn['Sentiment']
tfidf_test_knn = tfidf_vectorizer_knn.transform(X_test_knn)

# Load and preprocess training data for Random Forest Classifier
df_train_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_rfc.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_rfc.dropna(inplace=True)
X_train_rfc = df_train_rfc['text']
y_train_rfc = df_train_rfc['Label']
tfidf_vectorizer_rfc = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_rfc = tfidf_vectorizer_rfc.fit_transform(X_train_rfc)
rfc_model = RandomForestClassifier()
rfc_model.fit(tfidf_train_rfc, y_train_rfc)

# Load and preprocess test data for Random Forest Classifier
df_test_rfc = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_rfc.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_rfc.dropna(inplace=True)
X_test_rfc = df_test_rfc['text']
y_test_rfc = df_test_rfc['Label']
tfidf_test_rfc = tfidf_vectorizer_rfc.transform(X_test_rfc)

# Load and preprocess training data for XGBoost Classifier
df_train_xgb = pd.read_csv('/content/drive/MyDrive/twitter/twitter_training.csv')
df_train_xgb.rename(columns={'2401': 'id', 'Positive': 'Label', 'im getting on borderlands and i will murder you all ,': 'text'}, inplace=True)
df_train_xgb.dropna(inplace=True)
X_train_xgb = df_train_xgb['text']
y_train_xgb = df_train_xgb['Label']
tfidf_vectorizer_xgb = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train_xgb = tfidf_vectorizer_xgb.fit_transform(X_train_xgb)
label_encoder_xgb = LabelEncoder()
y_train_xgb_encoded = label_encoder_xgb.fit_transform(y_train_xgb)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(tfidf_train_xgb, y_train_xgb_encoded)

# Load and preprocess test data for XGBoost Classifier
df_test_xgb = pd.read_csv('/content/drive/MyDrive/twitter/twitter_validation.csv')
df_test_xgb.rename(columns={'3364': 'id', 'Irrelevant': 'Label', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣': 'text'}, inplace=True)
df_test_xgb.dropna(inplace=True)
X_test_xgb = df_test_xgb['text']
y_test_xgb = df_test_xgb['Label']
tfidf_test_xgb = tfidf_vectorizer_xgb.transform(X_test_xgb)

# Make predictions using each model
pac_pred = pac_model.predict(tfidf_test_pac)
knn_pred = knn_model.predict(tfidf_test_knn)
rfc_pred = rfc_model.predict(tfidf_test_rfc)
xgb_pred = xgb_model.predict(tfidf_test_xgb)

# Create a voting ensemble classifier
voting_clf = VotingClassifier(estimators=[
    ('pac', pac_model), ('knn', knn_model), ('rfc', rfc_model), ('xgb', xgb_model)], voting='hard')

# Fit the voting ensemble classifier
voting_clf.fit(tfidf_train_pac, y_train_pac)

# Make predictions using the voting ensemble classifier
voting_pred = voting_clf.predict(tfidf_test_pac)

# Evaluate the ensemble model's performance
accuracy_voting = accuracy_score(y_test_pac, voting_pred)
print(f'Voting Classifier Ensemble Accuracy: {accuracy_voting * 100:.2f}%')
print('Classification Report for Voting Classifier Ensemble:')
print(classification_report(y_test_pac, voting_pred))


Voting Classifier Ensemble Accuracy: 97.70%
Classification Report for Voting Classifier Ensemble:
              precision    recall  f1-score   support

  Irrelevant       0.98      0.98      0.98       171
    Negative       0.96      0.99      0.97       266
     Neutral       0.98      0.97      0.98       285
    Positive       0.99      0.97      0.98       277

    accuracy                           0.98       999
   macro avg       0.98      0.98      0.98       999
weighted avg       0.98      0.98      0.98       999

