In [1]:
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics, preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

In [2]:
# Load dataset
df = pd.read_csv("/content/Roman Urdu DataSet.csv", header=None)

In [3]:
# Preprocess the dataset
df.reset_index(inplace=True)  # Resetting the index to give column names
df.columns = ["A", "B", "C", "D"]  # Giving some random column names
del df["A"]  # Deleting unnecessary columns
del df["D"]
df.rename(columns={'B': 'text', 'C': 'sentiment'}, inplace=True)  # Renaming columns

# Correcting misspelled sentiment labels
df['sentiment'] = df['sentiment'].str.replace('Neative', 'Negative')

# Removing Emojis
df_new = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

# Removing 'Neutral' values
df_removed = df_new[~df_new.sentiment.str.contains("Neutral")]

# Encode labels in column 'sentiment'
label_encoder = preprocessing.LabelEncoder()
df_removed['Pos'] = label_encoder.fit_transform(df_removed['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed['Pos'] = label_encoder.fit_transform(df_removed['sentiment'])


In [4]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df_removed.text, df_removed.Pos, test_size=0.3, random_state=33)

In [None]:
# Define pipelines for different models

# Naive Bayes with Count Vectorizer
clf_NB = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])
clf_NB.fit(X_train, y_train)

# Naive Bayes with TFIDF Vectorizer
clf_NB_TFIDF = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('nb', MultinomialNB())
])
clf_NB_TFIDF.fit(X_train, y_train)

# XGBoost Pipeline
clf_XGB = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('xg', xgb.XGBClassifier(learning_rate=0.3, max_depth=5))
])
clf_XGB.fit(X_train, y_train)

# Passive Aggressive Classifier
clf_nbc = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('pa_clf', PassiveAggressiveClassifier(max_iter=1000))
])
clf_nbc.fit(X_train, y_train)

# SVM Pipeline
clf_SVM = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svm', svm.SVC(kernel='linear'))
])
clf_SVM.fit(X_train, y_train)

# Logistic Regression Pipeline
clf_LR = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('lr', LogisticRegression())
])
clf_LR.fit(X_train, y_train)


In [12]:
# Evaluating models' performance on test data
def evaluate_models():
    print("\nModel Performance on Test Data")

    # Naive Bayes (Count Vectorizer)
    y_pred_nb = clf_NB.predict(X_test)
    print("\nNaive Bayes (Count Vectorizer)")
    print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))

    # Naive Bayes (TFIDF Vectorizer)
    y_pred_nb_tfidf = clf_NB_TFIDF.predict(X_test)
    print("\nNaive Bayes (TFIDF Vectorizer)")
    print(classification_report(y_test, y_pred_nb_tfidf, target_names=label_encoder.classes_))

    # XGBoost
    y_pred_xgb = clf_XGB.predict(X_test)
    print("\nXGBoost")
    print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))

    # Passive Aggressive Classifier
    y_pred_pac = clf_nbc.predict(X_test)
    print("\nPassive Aggressive Classifier")
    print(classification_report(y_test, y_pred_pac, target_names=label_encoder.classes_))

    # SVM
    y_pred_svm = clf_SVM.predict(X_test)
    print("\nSVM")
    print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

    # Logistic Regression
    y_pred_lr = clf_LR.predict(X_test)
    print("\nLogistic Regression")
    print(classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))

# Call function to evaluate models
evaluate_models()


Model Performance on Test Data

Naive Bayes (Count Vectorizer)
              precision    recall  f1-score   support

    Negative       0.76      0.76      0.76      1536
    Positive       0.80      0.80      0.80      1854

    accuracy                           0.78      3390
   macro avg       0.78      0.78      0.78      3390
weighted avg       0.78      0.78      0.78      3390


Naive Bayes (TFIDF Vectorizer)
              precision    recall  f1-score   support

    Negative       0.80      0.70      0.74      1536
    Positive       0.77      0.86      0.81      1854

    accuracy                           0.78      3390
   macro avg       0.79      0.78      0.78      3390
weighted avg       0.79      0.78      0.78      3390


XGBoost
              precision    recall  f1-score   support

    Negative       0.69      0.77      0.73      1536
    Positive       0.79      0.71      0.75      1854

    accuracy                           0.74      3390
   macro avg       0.74

In [8]:
# Predicting the user input
def predict_user_input(text):
    print("\nPrediction for the given text:", text)

    pred_nb = clf_NB.predict([text])[0]
    pred_nb_tfidf = clf_NB_TFIDF.predict([text])[0]
    pred_xgb = clf_XGB.predict([text])[0]
    pred_pac = clf_nbc.predict([text])[0]
    pred_svm = clf_SVM.predict([text])[0]
    pred_lr = clf_LR.predict([text])[0]

    print("\nNaive Bayes (Count Vectorizer):", label_encoder.inverse_transform([pred_nb])[0])
    print("Naive Bayes (TFIDF Vectorizer):", label_encoder.inverse_transform([pred_nb_tfidf])[0])
    print("XGBoost:", label_encoder.inverse_transform([pred_xgb])[0])
    print("Passive Aggressive Classifier:", label_encoder.inverse_transform([pred_pac])[0])
    print("SVM:", label_encoder.inverse_transform([pred_svm])[0])
    print("Logistic Regression:", label_encoder.inverse_transform([pred_lr])[0])

# Taking input from the user
user_input = input("Enter a sentence for sentiment analysis: ")
predict_user_input(user_input)

Enter a sentence for sentiment analysis: boht bury insan ho tum

Prediction for the given text: boht bury insan ho tum

Naive Bayes (Count Vectorizer): Negative
Naive Bayes (TFIDF Vectorizer): Negative
XGBoost: Negative
Passive Aggressive Classifier: Negative
SVM: Negative
Logistic Regression: Negative
