### Checking the related libs are installed successfully

In [None]:
import sklearn
import numpy
import scipy
import pandas

# Report the installed version of every library this notebook depends on.
for banner, module in (
    ("Scikit-Learn version:", sklearn),
    ("NumPy version:", numpy),
    ("SciPy version:", scipy),
    ("Pandas version:", pandas),
):
    print(banner, module.__version__)

### Merging three assigned txt files into one CSV file

In [None]:
import pandas as pd

def merge_and_save_sentiments(file_list, output_csv):
    """
    Merge tab-separated sentiment files into one CSV file.

    Each input file is expected to hold one ``sentence<TAB>score`` record per
    line, with no header row. The merged table is written to ``output_csv``
    with the columns ``sentence`` and ``score`` and no index column, and a
    one-line summary is printed.

    Args:
        file_list: Iterable of paths to the tab-separated input files.
        output_csv: Path of the CSV file to write.

    Raises:
        ValueError: If ``file_list`` contains no files.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    # Quote characters are ordinary text in these files. With pandas' default
    # quoting, a sentence that starts with an unbalanced '"' swallows the
    # following lines into a single field and rows are silently lost.
    data_frames = [
        pd.read_csv(
            file_path,
            sep='\t',
            header=None,
            names=['sentence', 'score'],
            quoting=csv.QUOTE_NONE,
        )
        for file_path in file_list
    ]

    if not data_frames:
        raise ValueError("file_list must contain at least one input file")

    # Stack all files and renumber the rows from 0.
    merged_df = pd.concat(data_frames, ignore_index=True)

    # to_csv quotes/escapes embedded '"' itself, so the output stays a valid CSV.
    merged_df.to_csv(output_csv, index=False)

    print(f"Merged data saved to {output_csv}. Total sentences: {len(merged_df)}.")

# Destination of the merged CSV file.
output_csv = 'merged_sentiment_data.csv'

# Source files to merge (adjust these paths to your directory structure).
file_list = [
    'sentiment_labelled_sentences/amazon_cells_labelled.txt',
    'sentiment_labelled_sentences/imdb_labelled.txt',
    'sentiment_labelled_sentences/yelp_labelled.txt',
]

# Merge the three labelled files and write them out as one CSV.
merge_and_save_sentiments(file_list=file_list, output_csv=output_csv)


### Text Processing and Feature Extraction for Sentiment Analysis
* remove English common stop words
* remove punctuation and numbers

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re

# Download necessary NLTK data.
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead
# of the older 'punkt' pickles, so both are requested to keep the notebook
# working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize the PorterStemmer and English stop words.
# The stop words are held in a set so membership tests are O(1) per token.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stem_tokenize(text):
    """
    Normalise a sentence into a list of stemmed, stop-word-free tokens.

    The text is lower-cased, every run of punctuation, digits and underscores
    is replaced by a space, the result is tokenised with ``word_tokenize``,
    tokens found in the module-level ``stop_words`` set are dropped, and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw sentence to process.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # `\W` alone leaves underscores in place (`_` counts as a word character),
    # so it is listed explicitly together with digits.
    cleaned = re.sub(r'[\W\d_]+', ' ', text.lower())

    return [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

# Load the merged corpus produced by the merging step.
data_path = 'merged_sentiment_data.csv'
df = pd.read_csv(data_path)

# Bag-of-words counts, tokenised with the custom stemming tokenizer.
vectorizer = CountVectorizer(tokenizer=stem_tokenize)

# Features come from the sentences, labels from the score column.
X = vectorizer.fit_transform(df['sentence'])
y = df['score']

# FV1 bundles the count matrix with its labels for the later cells.
FV1 = {
    'feature_vector': X,
    'labels': y,
}
for part in ('feature_vector', 'labels'):
    print(FV1[part].shape)



### Feature Selection

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np

# TF-IDF features, tokenised with the same stemming tokenizer as FV1.
tfidf_vectorizer = TfidfVectorizer(tokenizer=stem_tokenize)
X_tfidf = tfidf_vectorizer.fit_transform(df['sentence'])

# Labels are the score column of the dataframe.
y = df['score']

# Keep the 512 columns with the highest chi-squared score.
# NOTE(review): the selector is fitted on every row and its labels, before the
# train/validation/test split in the next cell, so held-out labels influence
# which features are kept. Confirm whether that is acceptable for the
# reported numbers.
selector = SelectKBest(chi2, k=512).fit(X_tfidf, y)
X_selected = selector.transform(X_tfidf)

# Map the boolean support mask back to vocabulary terms.
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
selected_feature_names = feature_names[selector.get_support()]

# FV2 bundles the reduced matrix, its labels and the kept feature names.
FV2 = {
    'feature_vector': X_selected,
    'labels': y,
    'feature_names': selected_feature_names,
}

# Shape check for the feature matrix and the labels.
for part in ('feature_vector', 'labels'):
    print(FV2[part].shape)


### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# FV1: hold out 40% of the rows, keeping 60% for training.
X_train_fv1, X_temp_fv1, y_train_fv1, y_temp_fv1 = train_test_split(
    FV1['feature_vector'],
    FV1['labels'],
    test_size=0.4,
    random_state=42,
)
# Halve the held-out rows: the first half becomes the test set, the second
# half the validation set.
X_test_fv1, X_val_fv1, y_test_fv1, y_val_fv1 = train_test_split(
    X_temp_fv1,
    y_temp_fv1,
    test_size=0.5,
    random_state=42,
)

# FV2: same proportions and the same seed as FV1.
X_train_fv2, X_temp_fv2, y_train_fv2, y_temp_fv2 = train_test_split(
    FV2['feature_vector'],
    FV2['labels'],
    test_size=0.4,
    random_state=42,
)
X_test_fv2, X_val_fv2, y_test_fv2, y_val_fv2 = train_test_split(
    X_temp_fv2,
    y_temp_fv2,
    test_size=0.5,
    random_state=42,
)

# Row counts of every subset, to verify the split ratios.
for caption, subset in (
    ("FV1 - Training set size:", X_train_fv1),
    ("FV1 - Validation set size:", X_val_fv1),
    ("FV1 - Test set size:", X_test_fv1),
    ("FV2 - Training set size:", X_train_fv2),
    ("FV2 - Validation set size:", X_val_fv2),
    ("FV2 - Test set size:", X_test_fv2),
):
    print(caption, subset.shape[0])


### Classifiers
* SVM
* Naive Bayes
* K-NN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit each classifier on the training split and report its test-set metrics.
def train_and_evaluate(X_train, X_test, y_train, y_test, feature_set_name):
    """
    Train KNN, linear SVM and multinomial Naive Bayes and print their scores.

    For each classifier, in that order, the model is fitted on
    ``(X_train, y_train)``, used to predict ``X_test``, and its accuracy
    (as a percentage) and classification report against ``y_test`` are printed.

    Args:
        X_train: Training feature matrix.
        X_test: Feature matrix to predict on.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.
        feature_set_name: Label printed in the results header.
    """
    print(f"Results for {feature_set_name}:")

    # (accuracy line template, unfitted estimator), in reporting order.
    experiments = (
        ("Accuracy of KNN classifier: {:.2f}%", KNeighborsClassifier(n_neighbors=3)),
        ("Accuracy of SVM classifier: {:.2f}%", SVC(kernel='linear')),
        ("Accuracy of Naive Bayes classifier: {:.2f}%", MultinomialNB()),
    )

    for accuracy_template, model in experiments:
        predictions = model.fit(X_train, y_train).predict(X_test)
        accuracy_percent = accuracy_score(y_test, predictions) * 100
        print(accuracy_template.format(accuracy_percent))
        print(classification_report(y_test, predictions))

# Bag-of-words counts (FV1): fit on the training split, score on the test split.
train_and_evaluate(
    X_train=X_train_fv1,
    X_test=X_test_fv1,
    y_train=y_train_fv1,
    y_test=y_test_fv1,
    feature_set_name="Feature Vector 1",
)

# Chi-squared-selected TF-IDF features (FV2): same procedure.
train_and_evaluate(
    X_train=X_train_fv2,
    X_test=X_test_fv2,
    y_train=y_train_fv2,
    y_test=y_test_fv2,
    feature_set_name="Feature Vector 2",
)


### Step 5

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import time

def _save_roc_curve(y_true, y_score, name, feature_set_name):
    """Plot the ROC curve for one classifier and save it as a PNG file."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {name} using {feature_set_name}')
    plt.legend(loc="lower right")
    plt.savefig(f'ROC_Curve_for_{name}_using_{feature_set_name}.png')
    plt.close()  # Close the figure to free memory


def _save_precision_recall_curve(y_true, y_score, name, feature_set_name):
    """Plot the precision-recall curve for one classifier and save it as a PNG file."""
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    pr_auc = auc(recall, precision)

    plt.figure()
    plt.plot(recall, precision, label=f'Precision-Recall curve (area = {pr_auc:.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {name} using {feature_set_name}')
    plt.legend(loc="lower left")
    plt.savefig(f'Precision_Recall_Curve_for_{name}_using_{feature_set_name}.png')
    plt.close()  # Close the figure to free memory


def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, feature_set_name):
    """
    Train KNN, linear SVM and multinomial Naive Bayes; print metrics and save curves.

    Each classifier is fitted on ``(X_train, y_train)`` and evaluated on
    ``(X_test, y_test)``. Accuracy, the classification report and the
    training/prediction wall-clock times are printed. A ROC curve is saved
    for KNN and SVM, and a precision-recall curve for Naive Bayes, as PNG
    files in the current working directory.

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix (accepted but not used here).
        X_test: Test feature matrix.
        y_train: Labels for ``X_train``.
        y_val: Labels for ``X_val`` (accepted but not used here).
        y_test: True labels for ``X_test``.
        feature_set_name: Label used in the printed header, plot titles and
            output file names.
    """
    print(f"Results for {feature_set_name}:")
    classifiers = {
        "KNN": KNeighborsClassifier(n_neighbors=3),
        # probability=True is required for SVC to expose predict_proba.
        "SVM": SVC(kernel='linear', probability=True),
        "Naive Bayes": MultinomialNB()
    }

    for name, clf in classifiers.items():
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        y_pred = clf.predict(X_test)
        prediction_time = time.perf_counter() - start_time

        accuracy = accuracy_score(y_test, y_pred)
        print(f"{name} classifier:")
        print(f"Accuracy: {accuracy:.2f}")
        print(classification_report(y_test, y_pred))
        print(f"Training time: {training_time:.4f}s")
        print(f"Prediction time: {prediction_time:.4f}s")

        # Score of the second class (column 1) from THIS classifier. All three
        # models expose predict_proba, so the score is computed on every
        # iteration rather than reused from the previous classifier.
        y_score = clf.predict_proba(X_test)[:, 1]

        if name == "Naive Bayes":
            _save_precision_recall_curve(y_test, y_score, name, feature_set_name)
        else:
            _save_roc_curve(y_test, y_score, name, feature_set_name)
            
# Bag-of-words counts (FV1): metrics, timings and curve plots.
train_and_evaluate(
    X_train=X_train_fv1,
    X_val=X_val_fv1,
    X_test=X_test_fv1,
    y_train=y_train_fv1,
    y_val=y_val_fv1,
    y_test=y_test_fv1,
    feature_set_name="Feature Vector 1",
)

# Chi-squared-selected TF-IDF features (FV2): same procedure.
train_and_evaluate(
    X_train=X_train_fv2,
    X_val=X_val_fv2,
    X_test=X_test_fv2,
    y_train=y_train_fv2,
    y_val=y_val_fv2,
    y_test=y_test_fv2,
    feature_set_name="Feature Vector 2",
)


In [None]:
import pandas as pd

# Hard-coded summary metrics, one entry per (feature vector, classifier) pair.
# NOTE(review): presumably transcribed by hand from the evaluation output above;
# re-check these numbers whenever the evaluation is re-run.
data = {
    "Classifier": ["KNN", "SVM", "Naive Bayes", "KNN", "SVM", "Naive Bayes"],
    "Feature Vector": ["1", "1", "1", "2", "2", "2"],
    "Accuracy": [0.68, 0.80, 0.78, 0.72, 0.86, 0.83],
    "Precision (0)": [0.69, 0.82, 0.80, 0.85, 0.85, 0.94],
    "Precision (1)": [0.67, 0.78, 0.75, 0.65, 0.87, 0.76],
    "Recall (0)": [0.70, 0.79, 0.76, 0.57, 0.89, 0.73],
    "Recall (1)": [0.66, 0.81, 0.79, 0.89, 0.83, 0.95],
    "F1-Score (0)": [0.70, 0.80, 0.78, 0.68, 0.87, 0.82],
    "F1-Score (1)": [0.66, 0.79, 0.77, 0.75, 0.85, 0.84],
    "Support (0)": [288, 288, 288, 288, 288, 288],
    "Support (1)": [262, 262, 262, 262, 262, 262],
    "Training Time (s)": [0.0020, 1.1283, 0.0021, 0.0010, 0.4109, 0.0020],
    "Prediction Time (s)": [0.0554, 0.0426, 0.0002, 0.0645, 0.0191, 0.0002],
    # "NA" marks the Naive Bayes rows, for which no ROC AUC is recorded.
    "AUC (ROC)": [0.74, 0.88, "NA", 0.78, 0.93, "NA"],
}

# Build the table, indexed by (feature vector, classifier) for readability.
df_results = pd.DataFrame(data).set_index(["Feature Vector", "Classifier"])

# Trailing expression so the notebook renders the table.
df_results


### Self-implemented K-NN Classifier <- Does Not Work
This approach is not feasible for large datasets: with 3,000 test samples and 3,000 training samples, the number of distance calculations is the product of the two, i.e. approximately 9,000,000 individual computations.

So I call the scikit-learn API (`KNeighborsClassifier`) instead.

In [None]:
# from scipy.sparse import csr_matrix
# import numpy as np
# from collections import Counter

# def knn_predict_sparse(X_train, X_test, y_train, y_test, k):
#     def euclidean_distance_sparse(x1, x2):
#         diff = x1 - x2
#         return np.sqrt(diff.dot(diff.T).toarray()[0, 0])

#     def predict(X):
#         predictions = []
#         for i in range(X.shape[0]):
#             x = X[i]
#             distances = [euclidean_distance_sparse(x, X_train[j]) for j in range(X_train.shape[0])]
#             k_indices = np.argsort(distances)[:k]
#             k_nearest_labels = [y_train.iloc[i] for i in k_indices]
#             most_common = Counter(k_nearest_labels).most_common(1)
#             predictions.append(most_common[0][0])
#         return predictions

#     y_pred = predict(X_test)
#     accuracy = np.mean(np.array(y_pred) == y_test.to_numpy())
#     return y_pred, accuracy

# k = 3
# y_pred, accuracy = knn_predict_sparse(X_train, X_test, y_train, y_test, k)

# print(f"Accuracy with k={k}: {accuracy*100:.2f}%")