In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
import re
import gc
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Step 1: Load the data
train_data = pd.read_csv('train.csv')
test_ids = pd.read_csv('test.csv')

test_data = train_data.merge(test_ids['Id'], on="Id")
train_data = train_data.dropna(subset="Score")

train_data = train_data.sample(frac=0.7, random_state=42)

In [3]:
print(train_data.shape)
print(test_data.shape)

(1039739, 9)
(212192, 9)


In [4]:
# Step 2: preprocessing by filling in blank values and converting values to proper format

def preprocess_1(data):
    data['HelpfulnessRatio'] = np.where(data['HelpfulnessDenominator'] > 0,
                                                   data['HelpfulnessNumerator'] / data['HelpfulnessDenominator'], 
                                                   0)
    
    data['LogHelpfulnessNumerator'] = np.log1p(data['HelpfulnessNumerator'])
    data['LogHelpfulnessDenominator'] = np.log1p(data['HelpfulnessDenominator'])
    
    # Convert 'Time' to datetime and extract year and month if 'Time' column exists
    if 'Time' in data.columns:
        data['Time'] = pd.to_datetime(data['Time'], unit='s')
        data['Year'] = data['Time'].dt.year
        data['Month'] = data['Time'].dt.month
    else:
        # If 'Time' column is missing, fill with default values
        data['Year'] = 0
        data['Month'] = 0
    return data

train_data = preprocess_1(train_data)
test_data = preprocess_1(test_data)

In [5]:
from pandarallel import pandarallel 

def tokenize(x):
    return word_tokenize(str(x).lower())

pandarallel.initialize(progress_bar=True)

train_data['TextTokens'] = train_data['Text'].parallel_apply(tokenize)
train_data['SummaryTokens'] = train_data['Summary'].parallel_apply(tokenize)
test_data['TextTokens'] = test_data['Text'].parallel_apply(tokenize)
test_data['SummaryTokens'] = test_data['Summary'].parallel_apply(tokenize)

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=94522), Label(value='0 / 94522')))…

Python(18436) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=94522), Label(value='0 / 94522')))…

Python(18437) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18438) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18439) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18440) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18441) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18442) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18443) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18444) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18445) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18446) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18447) Malloc

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19291), Label(value='0 / 19291')))…

Python(18514) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18515) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18516) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18517) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18518) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18519) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18520) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18521) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18522) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18523) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18524) Malloc

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19291), Label(value='0 / 19291')))…

Python(18572) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18573) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18574) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18575) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18576) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18577) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18578) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18579) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18580) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18581) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18582) Malloc

In [6]:
# Step 3: sentiment analysis by positive and negative words

def preprocess_2(data):
    # these words are some words that can reasonably be expected to be found in positive and negative reviews
    positive_words = set([
        "amazing", "awesome", "best", "brilliant", "captivating", "charming", "compelling", 
        "delightful", "enjoyable", "entertaining", "excellent", "fantastic", "fascinating", 
        "fun", "great", "impressive", "incredible", "masterpiece", "memorable", "outstanding", 
        "perfect", "remarkable", "spectacular", "stunning", "superb", "thrilling", "touching", 
        "unforgettable", "wonderful", "worthy"
    ])
    
    negative_words = set([
        "awful", "boring", "cliched", "disappointing", "dreadful", "dull", "flat", "forgettable", 
        "horrible", "inconsistent", "lame", "mediocre", "messy", "nonsensical", "poor", "predictable", 
        "ridiculous", "shallow", "stale", "terrible", "unconvincing", "uninteresting", "uninspired", 
        "weak", "worthless", "worse", "worst"
    ])

    neutral_words = set([
        "mediocre", "average", "okay", "passable", "acceptable", "fine", "decent", "typical",
        "standard", "ordinary", "satisfactory", "modest", "usual", "unremarkable", "run-of-the-mill",
        "expected", "serviceable", "all right", "not bad", "workable", "sufficient"
    ])
    
    def simple_sentiment_analysis(text):
        """ takes text and checks it for positive and negative words (connotations) """
        pos, neg, neu = 0, 0, 0
        
        for word in text:
            if word in positive_words:
                pos += 1
            if word in negative_words:
                neg += 1
            if word in neutral_words:
                neu += 1
            
        return pos, neg, neu, pos-neg  # Positive values are more positive, negative values more negative

    # Apply sentiment analysis to Text and Summary
    data['PositiveWords'], data['NegativeWords'], data['NeutralWords'], data['SentimentText'] = zip(*data['TextTokens'].apply(simple_sentiment_analysis))
    _, _, _, data['SentimentSummary'] = zip(*data['SummaryTokens'].apply(simple_sentiment_analysis))

    return data

train_data = preprocess_2(train_data)
test_data = preprocess_2(test_data)

In [7]:
# Step 4: Feature Engineering - Text Preprocessing

def preprocess_3(data):
    def clean_text(text):
        text = re.sub(r'[^a-zA-Z!.?\s]', '', str(text))
        text = text.lower()
        return text
    
    # Apply text cleaning to training data
    data['CleanText'] = data['Text'].apply(clean_text)
    data['CleanSummary'] = data['Summary'].apply(clean_text)
    
    # this line adds a new feature that is the length of the text
    data['TextLen'] = data['CleanText'].apply(lambda x : len(x))
    data['AvgWordLen'] = data['TextTokens'].apply(lambda x : np.mean([len(word) for word in x]))
    
    # this line adds a new feature that is the number of exclamation marks in the text
    data['NumExclamations'] = data['CleanText'].apply(lambda x : x.count('!'))
    data['NumQuestions'] = data['CleanText'].apply(lambda x : x.count('?'))

    return data

train_data = preprocess_3(train_data)
test_data = preprocess_3(test_data)

In [9]:
# Load GloVe embeddings into a dictionary
def load_glove_embeddings(file_path, embedding_dim=100):
    embeddings_index = {}
    with open(file_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.array(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Create an embedding for each document by averaging word embeddings
def get_document_embedding(text, embeddings_index, embedding_dim=100):
    word_vectors = [embeddings_index[word] for word in text if word in embeddings_index]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(embedding_dim)

# Load the GloVe embeddings (adjust file path and dimension as needed)
glove_path = '/Users/victorialin/Downloads/glove/glove.6B.300d.txt'  # Update this to your file path
embedding_dim = 300  # Adjust based on the GloVe file used
embeddings_index = load_glove_embeddings(glove_path, embedding_dim=embedding_dim)

print("Loaded glove")

text_vectors_train = np.array([get_document_embedding(review, embeddings_index, embedding_dim) for review in train_data['TextTokens']])

summary_vectors_train = np.array([get_document_embedding(review, embeddings_index, embedding_dim) for review in train_data['SummaryTokens']])

print("done with train set")

text_vectors_test = np.array([get_document_embedding(review, embeddings_index, embedding_dim) for review in test_data['TextTokens']])
summary_vectors_test = np.array([get_document_embedding(review, embeddings_index, embedding_dim) for review in test_data['SummaryTokens']])

print("done with test set")

del embeddings_index

Loaded glove
done with train set
done with test set


In [10]:
# convert ProductIds to one-hot, do PCA
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import PCA

encoder = HashingVectorizer()
encoded_ids = encoder.fit_transform(train_data['ProductId'])
encoded_ids_test = encoder.transform(test_data['ProductId'])

pca = PCA(n_components=20)  # Adjust n_components based on desired dimensionality
reduced_ids = pca.fit_transform(encoded_ids)
reduced_ids_test = pca.transform(encoded_ids_test)

In [11]:
print(reduced_ids_test.shape)

(212192, 20)


In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

print("Applying sentiment analysis...")
def sentiment_scores(text):
    scores = analyzer.polarity_scores(text)
    return pd.Series({
        'Sentiment_Neg': scores['neg'],
        'Sentiment_Neu': scores['neu'],
        'Sentiment_Pos': scores['pos'],
        'Sentiment_Compound': scores['compound']
    })
sentiment_df = train_data['CleanText'].parallel_apply(sentiment_scores)
train_data = pd.concat([train_data.reset_index(drop=True), sentiment_df.reset_index(drop=True)], axis=1)

Applying sentiment analysis...


Python(18686) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=94522), Label(value='0 / 94522')))…

Python(18687) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18688) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18689) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18690) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18691) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18692) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18693) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18695) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18696) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18697) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18698) Malloc

In [13]:
sentiment_test = test_data['CleanText'].parallel_apply(sentiment_scores)
test_data = pd.concat([test_data.reset_index(drop=True), sentiment_test.reset_index(drop=True)], axis=1)

Python(18955) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19291), Label(value='0 / 19291')))…

Python(18956) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18957) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18958) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18959) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18960) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18961) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18962) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18963) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18964) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18965) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(18966) Malloc

In [14]:
# Standardize numeric features
feats = [
    'HelpfulnessRatio', 'LogHelpfulnessNumerator', 'LogHelpfulnessDenominator', 'SentimentText', 'SentimentSummary', 
    'Year', 'Month', 'TextLen', 'NumExclamations', 'NumQuestions', 'PositiveWords', 'NegativeWords', 'AvgWordLen', 'NeutralWords',
    'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos', 'Sentiment_Compound'
]

numeric_features_train = train_data[feats]
numeric_features_test = test_data[feats]

scaler = StandardScaler()
numeric_features_scaled = scaler.fit_transform(numeric_features_train)
numeric_features_scaled_test = scaler.transform(numeric_features_test)


# this is the FULL DATASET THAT IS FULLY CLEANED
X = np.hstack([numeric_features_scaled, text_vectors_train, summary_vectors_train, reduced_ids])
X_test = np.hstack([numeric_features_scaled_test, text_vectors_test, summary_vectors_test, reduced_ids_test])
y = train_data['Score']

In [15]:
print(X.shape)

(1039739, 638)


In [16]:
# Step 6: data splitting

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # shift y values for XGBoost
y_train_shifted = [int(x-1) for x in y_train]
y_val_shifted = [int(x-1) for x in y_val]

In [22]:
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=-1)
logreg.fit(X_train, y_train_shifted)

y_pred = logreg.predict(X_val)

print("Accuracy:", accuracy_score(y_val_shifted, y_pred))
print("Classification Report:\n", classification_report(y_val_shifted, y_pred))

Python(19048) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19049) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19050) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19051) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19052) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19053) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19054) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19055) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19056) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19057) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(19058) Malloc

Accuracy: 0.6397224306076519
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.63      0.62     12751
           1       0.42      0.23      0.30     12468
           2       0.43      0.34      0.38     24597
           3       0.47      0.29      0.36     46920
           4       0.72      0.90      0.80    111212

    accuracy                           0.64    207948
   macro avg       0.53      0.48      0.49    207948
weighted avg       0.60      0.64      0.61    207948



In [26]:
# Step 8: Generate Predictions for the Test Set

# predict
y_test_pred = logreg.predict(X_test)

# shift back to class labels
y_test_pred = y_test_pred + 1

# Step 9: Prepare the submission file
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Score': y_test_pred
})

submission.to_csv('submission.csv', index=False)
print("Submission file generated!")

# Optional: Clear memory to avoid kernel crashes
gc.collect()

Submission file generated!


140