In [13]:
import pandas as pd
import pandas as pd

# Function to read a CSV file, retrying with a fallback encoding when decoding fails
def read_csv_with_encoding(file_path, encoding='utf-8', fallback_encoding='ISO-8859-1'):
    """Load a CSV into a DataFrame, retrying once with a fallback encoding.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    encoding : str, default 'utf-8'
        Encoding tried first.
    fallback_encoding : str or None, default 'ISO-8859-1'
        Encoding used for the second attempt when the first one raises
        ``UnicodeDecodeError``. Pass ``None`` to disable the retry.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    UnicodeDecodeError
        If the first attempt fails and no fallback is configured, or if the
        fallback attempt fails as well.
    """
    try:
        return pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        if fallback_encoding is None:
            raise
        # The primary encoding could not decode the file; retry with the fallback.
        return pd.read_csv(file_path, encoding=fallback_encoding)

# Step 1: Read the three splits (with encoding fallback) and stack them into one dataframe
test_df = read_csv_with_encoding('test.csv')
train_df = read_csv_with_encoding('train.csv')
dev_df = read_csv_with_encoding('dev.csv')

combined_df = pd.concat([test_df, train_df, dev_df])

# Step 2: Drop rows without an answer span.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the concatenated frame (SettingWithCopyWarning).
combined_df = combined_df[combined_df['human_ans_spans'] != 'ANSWERNOTFOUND'].copy()

# Step 3: Binary label: 0 where answer_subj_level equals 1, otherwise 1.
# Vectorized comparison instead of a row-by-row lambda; same values.
combined_df['subjectivity'] = (combined_df['answer_subj_level'] != 1).astype(int)

# Step 4: Keep only the answer text (renamed to 'text') and its label
new_df = combined_df[['human_ans_spans', 'subjectivity']].rename(columns={'human_ans_spans': 'text'})

print(new_df.head())


                                                 text  subjectivity
5   it brought together the characters endings in ...             1
7             Being a person who rarely reads fantasy             0
14                          What a funny , funny read             0
17                                               good             1
19                          It is a whirlwind romance             0


In [29]:
# Persist every row of new_df (not only the head, despite the file name), without the index column.
new_df.to_csv(path_or_buf='new_df_head.csv', index=False)


In [50]:
import pandas as pd

# Load the review dataset used by every cell below.
# NOTE(review): this CSV is not written by any cell visible in this notebook
# (the cell above writes 'new_df_head.csv'); confirm where it is produced.
df = pd.read_csv('balanced_reviews_with_enhanced_subjectivity.csv')


In [51]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Built once instead of on every call to clean_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Lower-case, strip punctuation, tokenize and drop English stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example a NaN from a
        missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        Remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        # Missing or non-text cells would otherwise fail on .lower().
        return []
    # Lowercasing
    lowered = text.lower()
    # Remove punctuation (every character in string.punctuation, apostrophes included)
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    # Tokenization and removing stopwords
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(without_punctuation) if word not in stop_words]

from nltk.stem import PorterStemmer
# Single stemmer instance shared by every call to stem_words.
ps = PorterStemmer()
def stem_words(words):
    """Return a new list holding the Porter stem of each token in ``words``."""
    return list(map(ps.stem, words))

# Derive each text column from the previous one: raw review -> cleaned tokens -> stemmed tokens.
for source_col, target_col, transform in (
    ('reviewText', 'cleaned_text', clean_text),
    ('cleaned_text', 'stemmed_text', stem_words),
):
    df[target_col] = df[source_col].apply(transform)


In [11]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


# Embeddings

## Word2Vec

In [60]:
from gensim.models import Word2Vec
import numpy as np

# Each document is a list of stemmed tokens; collect them as the training corpus.
sentences = df['stemmed_text'].tolist()

# Train Word2Vec model on the tokenized documents
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to turn a tokenized sentence into one vector using the Word2Vec model
def vectorize_sentence(sentence, model):
    """Average the vectors of the tokens in ``sentence`` that ``model.wv`` contains.

    Tokens missing from ``model.wv`` are skipped. When no token is found, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged vector per document, stacked into a single array
document_vectors = [vectorize_sentence(tokens, word2vec_model) for tokens in sentences]
X_word2vec = np.array(document_vectors)


## TFIDF

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-assemble each token list into a whitespace-separated string for the vectorizer
df['stemmed_text_joined'] = df['stemmed_text'].apply(' '.join)

# NOTE(review): the vectorizer is fitted on every row of df here, i.e. before
# any train/test split is made. Confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['stemmed_text_joined'])


## GloVe

In [62]:
import numpy as np

def vectorize_text_glove(text, glove_model):
    """Average the GloVe vectors of the tokens in ``text`` found in ``glove_model``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    glove_model : mapping-like
        Object supporting ``word in glove_model`` and ``glove_model[word]``.
        If it exposes a ``vector_size`` attribute, that is used as the size
        of the zero vector.

    Returns
    -------
    numpy.ndarray
        Mean of the matched vectors, or a zero vector when nothing matches.
    """
    word_vectors = [glove_model[word] for word in text if word in glove_model]
    if len(word_vectors) == 0:
        # Take the dimension from the model so other embedding sizes work;
        # fall back to 25, the dimension of glove-twitter-25 embeddings.
        return np.zeros(getattr(glove_model, 'vector_size', 25))
    return np.mean(word_vectors, axis=0)

import gensim.downloader as gensim_api

# `glove_model` is not created by any cell visible in this notebook, so a fresh
# kernel fails here with a NameError. Load the pretrained vectors that
# vectorize_text_glove's comment refers to (gensim downloads them on first use).
glove_model = gensim_api.load('glove-twitter-25')

# Apply this function to your stemmed text
# NOTE(review): the tokens are Porter stems while the pretrained vocabulary is
# presumably unstemmed, so some tokens may not match. Verify coverage.
X_glove = np.array([vectorize_text_glove(text, glove_model) for text in df['stemmed_text']])


## Logistic Regression
### Word2Vec, TF-IDF and GloVe

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Assuming 'y' is your target variable
y = df['enhanced_subjectivity']

# One split per feature matrix. The same test_size and random_state are used
# for all three, so each representation is split on the same rows.
# (The TF-IDF and GloVe splits were previously not defined in any visible cell.)
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(X_glove, y, test_size=0.2, random_state=42)

# Hyperparameter grid shared by the three searches
param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}

# Logistic Regression with GridSearchCV for Word2Vec
# max_iter is raised from the default because the solver previously stopped at
# its iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT").
grid_w2v = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_w2v.fit(X_train_w2v, y_train_w2v)

# Evaluation for Word2Vec
y_pred_w2v = grid_w2v.predict(X_test_w2v)
print("Word2Vec + Logistic Regression:")
print(classification_report(y_test_w2v, y_pred_w2v))

# Logistic Regression with GridSearchCV for TF-IDF
grid_tfidf = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_tfidf.fit(X_train_tfidf, y_train_tfidf)

# Evaluation for TF-IDF
y_pred_tfidf = grid_tfidf.predict(X_test_tfidf)

# Logistic Regression with GridSearchCV for GloVe
grid_glove = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_glove.fit(X_train_glove, y_train_glove)

# Best parameters and evaluation
print("Best Parameters:", grid_glove.best_params_)
y_pred_glove_tuned = grid_glove.predict(X_test_glove)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Word2Vec + Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.70      0.87      0.77      2307
         1.0       0.83      0.63      0.71      2333

    accuracy                           0.75      4640
   macro avg       0.76      0.75      0.74      4640
weighted avg       0.76      0.75      0.74      4640



## Logistic Regression Results:


In [66]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Function to calculate metrics
def get_metrics(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and AUC as percentages.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels (not probabilities).

    Returns
    -------
    dict
        Keys 'Accuracy (%)', 'Precision (%)', 'Recall (%)', 'F1 score (%)'
        and 'AUC (%)', each on a 0-100 scale.
    """
    # One-hot encode both vectors against the same label set (the labels seen
    # in y_true). Encoding them independently produced matrices with different
    # column counts whenever the predictions missed one of the classes.
    labels = sorted(pd.Series(y_true).unique())
    true_onehot = pd.get_dummies(pd.Categorical(y_true, categories=labels)).to_numpy(dtype=int)
    pred_onehot = pd.get_dummies(pd.Categorical(y_pred, categories=labels)).to_numpy(dtype=int)
    return {
        'Accuracy (%)': accuracy_score(y_true, y_pred) * 100,
        'Precision (%)': precision_score(y_true, y_pred, average='weighted') * 100,
        'Recall (%)': recall_score(y_true, y_pred, average='weighted') * 100,
        'F1 score (%)': f1_score(y_true, y_pred, average='weighted') * 100,
        # AUC is computed from hard labels, macro-averaged over the one-hot
        # columns. `multi_class` is dropped: it has no effect on indicator input.
        'AUC (%)': roc_auc_score(true_onehot, pred_onehot) * 100
    }

# Score the logistic-regression predictions for Word2Vec, TF-IDF and GloVe (in that order)
metrics_w2v, metrics_tfidf, metrics_glove = (
    get_metrics(truth, predicted)
    for truth, predicted in (
        (y_test_w2v, y_pred_w2v),
        (y_test_tfidf, y_pred_tfidf),
        (y_test_glove, y_pred_glove_tuned),
    )
)

# One row per embedding, one column per metric
results_df = pd.DataFrame(
    data=[metrics_w2v, metrics_tfidf, metrics_glove],
    index=['Lr+Word2Vec', 'Lr+TF-IDF', 'Lr+GloVe'],
)

print(results_df)


          Accuracy (%)  Precision (%)  Recall (%)  F1 score (%)    AUC (%)
Word2Vec     74.741379      76.303937   74.741379     74.385835  74.808216
TF-IDF       88.879310      89.029241   88.879310     88.870797  88.896110
GloVe        66.487069      66.593254   66.487069     66.447443  66.506909


## Random Forest

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Hyperparameter grid
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}


def _tune_random_forest(X_train, y_train, X_test, y_test):
    """Grid-search a random forest on one feature set and score it on its test split.

    Returns a tuple ``(fitted GridSearchCV, test predictions, get_metrics dict)``.
    """
    # random_state fixes the forest's internal randomness so that reruns
    # select and report the same model.
    search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    return search, predictions, get_metrics(y_test, predictions)


# Random Forest with GridSearchCV for Word2Vec
grid_rf_w2v, y_pred_rf_w2v, metrics_rf_w2v = _tune_random_forest(
    X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v)

# Random Forest with GridSearchCV for TF-IDF
grid_rf_tfidf, y_pred_rf_tfidf, metrics_rf_tfidf = _tune_random_forest(
    X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)

# Random Forest with GridSearchCV for GloVe
grid_rf_glove, y_pred_rf_glove, metrics_rf_glove = _tune_random_forest(
    X_train_glove, y_train_glove, X_test_glove, y_test_glove)

# Combine results into a DataFrame
results_rf_df = pd.DataFrame([metrics_rf_w2v, metrics_rf_tfidf, metrics_rf_glove], 
                             index=['RF + Word2Vec', 'RF + TF-IDF', 'RF + GloVe'])

print(results_rf_df)


               Accuracy (%)  Precision (%)  Recall (%)  F1 score (%)  \
RF + Word2Vec     74.159483      74.669191   74.159483     74.040848   
RF + TF-IDF       89.288793      89.317545   89.288793     89.287731   
RF + GloVe        70.000000      70.269034   70.000000     69.916386   

                 AUC (%)  
RF + Word2Vec  74.198162  
RF + TF-IDF    89.295824  
RF + GloVe     70.030216  


## XGBoost

In [68]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd


# Hyperparameter grid
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.5, 0.75, 1]
}


def _tune_xgboost(X_train, y_train, X_test, y_test):
    """Grid-search an XGBoost classifier on one feature set and score it on its test split.

    Returns a tuple ``(fitted GridSearchCV, test predictions, get_metrics dict)``.
    """
    # The target has two classes, so use the binary 'logloss' metric rather
    # than the multiclass 'mlogloss'.
    estimator = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    search = GridSearchCV(estimator, param_grid_xgb, cv=5)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    return search, predictions, get_metrics(y_test, predictions)


# XGBoost with GridSearchCV for Word2Vec
grid_xgb_w2v, y_pred_xgb_w2v, metrics_xgb_w2v = _tune_xgboost(
    X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v)

# XGBoost with GridSearchCV for TF-IDF
grid_xgb_tfidf, y_pred_xgb_tfidf, metrics_xgb_tfidf = _tune_xgboost(
    X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)

# XGBoost with GridSearchCV for GloVe
grid_xgb_glove, y_pred_xgb_glove, metrics_xgb_glove = _tune_xgboost(
    X_train_glove, y_train_glove, X_test_glove, y_test_glove)

# Combine results into a DataFrame
results_xgb_df = pd.DataFrame([metrics_xgb_w2v, metrics_xgb_tfidf, metrics_xgb_glove], 
                             index=['XGB + Word2Vec', 'XGB + TF-IDF', 'XGB + GloVe'])

print(results_xgb_df)










































                Accuracy (%)  Precision (%)  Recall (%)  F1 score (%)  \
XGB + Word2Vec     74.137931      74.460993   74.137931     74.064662   
XGB + TF-IDF       88.728448      88.944506   88.728448     88.715454   
XGB + GloVe        70.474138      70.547882   70.474138     70.455587   

                  AUC (%)  
XGB + Word2Vec  74.168519  
XGB + TF-IDF    88.748746  
XGB + GloVe     70.488911  


## SVM

In [70]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Function to calculate metrics
def get_metrics(y_true, y_pred):
    """Compute accuracy, weighted precision/recall/F1 and AUC as percentages.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels (not probabilities).

    Returns
    -------
    dict
        Keys 'Accuracy (%)', 'Precision (%)', 'Recall (%)', 'F1 score (%)'
        and 'AUC (%)', each on a 0-100 scale.
    """
    # One-hot encode both vectors against the same label set (the labels seen
    # in y_true). Encoding them independently produced matrices with different
    # column counts whenever the predictions missed one of the classes.
    labels = sorted(pd.Series(y_true).unique())
    true_onehot = pd.get_dummies(pd.Categorical(y_true, categories=labels)).to_numpy(dtype=int)
    pred_onehot = pd.get_dummies(pd.Categorical(y_pred, categories=labels)).to_numpy(dtype=int)
    return {
        'Accuracy (%)': accuracy_score(y_true, y_pred) * 100,
        'Precision (%)': precision_score(y_true, y_pred, average='weighted') * 100,
        'Recall (%)': recall_score(y_true, y_pred, average='weighted') * 100,
        'F1 score (%)': f1_score(y_true, y_pred, average='weighted') * 100,
        # AUC is computed from hard labels, macro-averaged over the one-hot
        # columns. `multi_class` is dropped: it has no effect on indicator input.
        'AUC (%)': roc_auc_score(true_onehot, pred_onehot) * 100
    }

# Hyperparameter grid for SVM
# `gamma` is only a coefficient of the 'rbf' and 'poly' kernels here, so it is
# searched for those kernels only. Crossing it with 'linear' fitted each
# linear candidate twice with identical results.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]


def _tune_svm(X_train, y_train, X_test, y_test):
    """Grid-search an SVC on one feature set and score it on its test split.

    Returns a tuple ``(fitted GridSearchCV, test predictions, get_metrics dict)``.
    """
    search = GridSearchCV(SVC(), param_grid_svm, cv=5)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)
    return search, predictions, get_metrics(y_test, predictions)


# SVM with GridSearchCV for Word2Vec
grid_svm_w2v, y_pred_svm_w2v, metrics_svm_w2v = _tune_svm(
    X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v)

# SVM with GridSearchCV for TF-IDF
grid_svm_tfidf, y_pred_svm_tfidf, metrics_svm_tfidf = _tune_svm(
    X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf)

# SVM with GridSearchCV for GloVe
grid_svm_glove, y_pred_svm_glove, metrics_svm_glove = _tune_svm(
    X_train_glove, y_train_glove, X_test_glove, y_test_glove)

# Combine results into a DataFrame
results_svm_df = pd.DataFrame([metrics_svm_w2v, metrics_svm_tfidf, metrics_svm_glove], 
                             index=['SVM + Word2Vec', 'SVM + TF-IDF', 'SVM + GloVe'])

print(results_svm_df)


                Accuracy (%)  Precision (%)  Recall (%)  F1 score (%)  \
SVM + Word2Vec     74.762931      78.251044   74.762931     73.994676   
SVM + TF-IDF       89.913793      90.062997   89.913793     89.906312   
SVM + GloVe        71.767241      71.972224   71.767241     71.713333   

                  AUC (%)  
SVM + Word2Vec  74.860081  
SVM + TF-IDF    89.930384  
SVM + GloVe     71.792441  


In [None]:
## CNN

In [None]:
pip install tensorflow


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Imported here so this cell does not depend on an earlier cell's import.
from sklearn.model_selection import train_test_split

# Seed TensorFlow's global random generator so reruns are more repeatable.
tf.random.set_seed(42)

# Parameters
max_length = 100  # Length of input sequences
vocab_size = 10000  # Size of the vocabulary
embedding_dim = 100  # Output dimension of the Embedding layer (trained with the model, not loaded from Word2Vec)

# Tokenize and pad the sequences
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['stemmed_text_joined'])
sequences = tokenizer.texts_to_sequences(df['stemmed_text_joined'])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# Carve a validation set out of the training data, so the test split is not
# used while the model is being trained and monitored.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Building the CNN model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_fit, y_fit, epochs=10, validation_data=(X_val, y_val))


In [1]:
from sklearn.metrics import classification_report

# Predictions: round the model outputs to the nearest integer class label
y_pred_cnn = model.predict(X_test)
y_pred_cnn_classes = np.rint(y_pred_cnn).astype(int)  # Adjust based on your classification task

# Evaluation
cnn_report = classification_report(y_test, y_pred_cnn_classes)
print("CNN Model Evaluation:")
print(cnn_report)


NameError: name 'model' is not defined

In [None]:
## LSTM

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Seed TensorFlow's global random generator so reruns are more repeatable.
tf.random.set_seed(42)

# Parameters
max_length = 100  # Length of input sequences
vocab_size = 10000  # Size of the vocabulary
embedding_dim = 100  # Output dimension of the Embedding layer (trained with the model, not loaded from Word2Vec)

# Tokenize and pad the sequences
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['stemmed_text_joined'])
sequences = tokenizer.texts_to_sequences(df['stemmed_text_joined'])
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# Carve a validation set out of the training data, so the test split is not
# used while the model is being trained and monitored.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Building the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(128, return_sequences=True))  # first LSTM returns the full sequence for the second LSTM
model.add(LSTM(128))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_fit, y_fit, epochs=10, validation_data=(X_val, y_val))


In [None]:
# Predictions: round the model outputs to the nearest integer class label
y_pred_lstm = model.predict(X_test)
y_pred_lstm_classes = np.rint(y_pred_lstm).astype(int)  # Adjust based on your classification task

# Evaluation
lstm_report = classification_report(y_test, y_pred_lstm_classes)
print("LSTM Model Evaluation:")
print(lstm_report)
