In [295]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
import warnings
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# ignore future warnings 
warnings.simplefilter(action='ignore')

[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [271]:
# Load the data
train_data = pd.read_csv('TRAIN.csv')
validation_data = pd.read_csv('VALIDATION.csv')
test_data = pd.read_csv('TEST_NO_LABELS.csv')

# Load the TF-IDF features for baselines 
tfidf_train = pd.read_csv('TFIDF_TRAIN.csv')
tfidf_validation = pd.read_csv('TFIDF_VALIDATION.csv')


y_train = train_data['rating']
y_test = validation_data['rating']

In [272]:
def clean_text(text):
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^A-Za-z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords and perform lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    
    # Join the words back into a clean text
    cleaned_text = ' '.join(words)
    
    return cleaned_text

# Function to create a pipeline for data preprocessing and TF-IDF with a specified classifier
def create_pipeline(classifier):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=2000)),  # TF-IDF vectorization
        ('classifier', classifier)  # Classifier (Logistic Regression, Multinomial Naive Bayes)
    ])
    return pipeline

In [273]:
# Function to test a model on a specific dataset
def test_model(model, X, y):
    # Make predictions
    y_pred = model.predict(X)

    # Calculate accuracy
    accuracy = accuracy_score(y, y_pred)
    print("Accuracy:", accuracy)

    class_report = classification_report(y, y_pred)
    print("Classification Report:\n", class_report)

In [274]:
# Clean the text data
train_data['review-text-cleaned'] = train_data['review-text-cleaned'].apply(clean_text)
validation_data['review-text-cleaned'] = validation_data['review-text-cleaned'].apply(clean_text)

# Create pipelines with different classifiers
logistic_regression_pipeline = create_pipeline(LogisticRegression())
naive_bayes_pipeline = create_pipeline(MultinomialNB())

In [275]:
# Function to model the data
def model_data(train_data, validation_data, pipeline, classifier_name):
    # Split the data into training and validation sets
    X_train = train_data['review-text-cleaned']
    y_train = train_data['rating']
    
    X_validation = validation_data['review-text-cleaned']
    y_validation = validation_data['rating']

    # Fit the model with the pipeline
    pipeline.fit(X_train, y_train)

    # Test the model on the validation data
    print(f"Testing on {classifier_name}:\n")
    test_model(pipeline, X_validation, y_validation)

In [277]:
# Model the data on the whole dataset using different classifiers
model_data(train_data, validation_data, logistic_regression_pipeline, "Logistic Regression (Whole Data)")
model_data(train_data, validation_data, naive_bayes_pipeline, "Multinomial Naive Bayes (Whole Data)")

Testing on Logistic Regression (Whole Data):

Accuracy: 0.9156363636363636
Classification Report:
               precision    recall  f1-score   support

          -1       0.83      0.86      0.84      1462
           1       0.95      0.94      0.94      4038

    accuracy                           0.92      5500
   macro avg       0.89      0.90      0.89      5500
weighted avg       0.92      0.92      0.92      5500

Testing on Multinomial Naive Bayes (Whole Data):

Accuracy: 0.893090909090909
Classification Report:
               precision    recall  f1-score   support

          -1       0.87      0.70      0.78      1462
           1       0.90      0.96      0.93      4038

    accuracy                           0.89      5500
   macro avg       0.88      0.83      0.85      5500
weighted avg       0.89      0.89      0.89      5500



In [278]:
# Divide validation data based on gender
female_validation_data = validation_data[validation_data['dr_id_gender'] == 0]
male_validation_data = validation_data[validation_data['dr_id_gender'] == 1]
unknown_gender_validation_data = validation_data[validation_data['dr_id_gender'] == 2]

# Model the data for each gender category separately using different classifiers
model_data(train_data, female_validation_data, logistic_regression_pipeline, "Logistic Regression (Female)")
model_data(train_data, male_validation_data, logistic_regression_pipeline, "Logistic Regression(Male)")
model_data(train_data, unknown_gender_validation_data, logistic_regression_pipeline, "Logistic Regression(Unknown)")

model_data(train_data, female_validation_data, naive_bayes_pipeline, "Multinomial Naive Bayes (Female)")
model_data(train_data, male_validation_data, naive_bayes_pipeline, "Multinomial Naive Bayes (Male)")
model_data(train_data, unknown_gender_validation_data, naive_bayes_pipeline, "Multinomial Naive Bayes (Unknown)")

Testing on Logistic Regression (Female):

Accuracy: 0.9291917167668671
Classification Report:
               precision    recall  f1-score   support

          -1       0.87      0.86      0.86       392
           1       0.95      0.95      0.95      1105

    accuracy                           0.93      1497
   macro avg       0.91      0.91      0.91      1497
weighted avg       0.93      0.93      0.93      1497

Testing on Logistic Regression(Male):

Accuracy: 0.923828125
Classification Report:
               precision    recall  f1-score   support

          -1       0.86      0.86      0.86       983
           1       0.95      0.95      0.95      2601

    accuracy                           0.92      3584
   macro avg       0.90      0.90      0.90      3584
weighted avg       0.92      0.92      0.92      3584

Testing on Logistic Regression(Unknown):

Accuracy: 0.7971360381861575
Classification Report:
               precision    recall  f1-score   support

          -1    

In [284]:
#Model for calculating the baselines using OneR and ZeroR
def baselines(xtrain,ytrain,xtest,ytest):
    
    OneR_Acc_1 = []
    ZeroR_Acc_1 = []
    
    # One-R model
    OneR_Model = DummyClassifier(strategy="prior")
    OneR_Model.fit(xtrain, ytrain)
    OneR_Predictions = OneR_Model.predict(xtest)
    OneR_Acc = accuracy_score(ytest, OneR_Predictions)
    OneR_Acc_1.append(OneR_Acc)
    
    print("Accuracy of One-R:", np.mean(OneR_Acc_1).round(2))
    class_report = classification_report(ytest, OneR_Predictions)
    print("Classification Report:\n", class_report)

In [285]:
baselines(tfidf_train,y_train,tfidf_validation,validation_data.rating)

Accuracy of One-R: 0.73
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1462
           1       0.73      1.00      0.85      4038

    accuracy                           0.73      5500
   macro avg       0.37      0.50      0.42      5500
weighted avg       0.54      0.73      0.62      5500



In [286]:
baselines(tfidf_train,y_train,tfidf_validation_male,validation_male_df.rating)

Accuracy of One-R: 0.73
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       983
           1       0.73      1.00      0.84      2601

    accuracy                           0.73      3584
   macro avg       0.36      0.50      0.42      3584
weighted avg       0.53      0.73      0.61      3584



In [287]:
baselines(tfidf_train,y_train,tfidf_validation_female,validation_female_df.rating)

Accuracy of One-R: 0.74
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00       392
           1       0.74      1.00      0.85      1105

    accuracy                           0.74      1497
   macro avg       0.37      0.50      0.42      1497
weighted avg       0.54      0.74      0.63      1497



In [288]:
baselines(tfidf_train,y_train,tfidf_validation_unknown,validation_unknown_df.rating)

Accuracy of One-R: 0.79
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00        87
           1       0.79      1.00      0.88       332

    accuracy                           0.79       419
   macro avg       0.40      0.50      0.44       419
weighted avg       0.63      0.79      0.70       419



In [290]:
#Predictions on the Test data. 

# Clean the text data
train_data['review-text-cleaned'] = train_data['review-text-cleaned'].apply(clean_text)
test_data['review-text-cleaned'] = test_data['review-text-cleaned'].apply(clean_text)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=2000)  # TF-IDF vectorization

# Transform the text data into TF-IDF features
X_train = tfidf_vectorizer.fit_transform(train_data['review-text-cleaned'])
y_train = train_data['rating']

X_test = tfidf_vectorizer.transform(test_data['review-text-cleaned'])

# Create a Logistic Regression model
logistic_regression_model = LogisticRegression()

# Fit the model on the training data
logistic_regression_model.fit(X_train, y_train)

# Make predictions on the test data
test_predictions = logistic_regression_model.predict(X_test)

In [291]:
test_predictions

array([ 1,  1,  1, ...,  1, -1,  1])

In [296]:
import pkg_resources
import types

def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__

def get_version(pkg):
    try:
        return pkg_resources.get_distribution(pkg).version
    except pkg_resources.DistributionNotFound:
        return None

imports = list(get_imports())
packages = [pkg for pkg in imports if pkg != 'builtins']

versions = {pkg: get_version(pkg) for pkg in packages}

# Save the versions to a text file
with open('README3.txt', 'w') as file:
    for pkg, version in versions.items():
        file.write(f"{pkg}: {version}\n")

In [298]:
sklearn_version = re.__version__

print(f"scikit-learn version: {sklearn_version}")


scikit-learn version: 2.2.1
