In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV paths
# read later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Pin the global NumPy and stdlib random generators so reruns are repeatable.
np.random.seed(42)
random.seed(42)

# Locations of the input CSVs on the mounted Google Drive.
train_file_path = '/content/drive/MyDrive/Colab Notebooks/train.csv'
test_file_path = '/content/drive/MyDrive/Colab Notebooks/test.csv'

# Load both files, train first, into their respective frames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Quick look at the first training rows.
print(train_df.head())

        Id   ProductId          UserId  HelpfulnessNumerator  \
0   914403  B0009W5KHM   AV6QDP8Q0ONK4                     2   
1   354887  6303079709  A2I8RXJN80A2D2                     0   
2  1407653  B004H0M2XC  A3FHV3RV8Z12E6                     0   
3  1377458  B003ZJ9536  A12VLTA3ZHVPUY                     1   
4   475323  630574453X  A13NM1PES9OXVN                     2   

   HelpfulnessDenominator        Time  \
0                       2  1341014400   
1                       0  1168819200   
2                       0  1386201600   
3                       1  1348704000   
4                       3   970012800   

                                         Summary  \
0                                  GOOD FUN FILM   
1                                   Movie Review   
2             When is it a good time to Consent?   
3                                          TRUTH   
4  Intelligent and bittersweet -- stays with you   

                                                Text  S

In [4]:
# Review fields for the test Ids are looked up in train_df by Id; a left join
# keeps every row of test_df even when no match is found.
merge_columns = ['ProductId', 'UserId', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary', 'Text']

# Only pull in columns test_df does not have yet. Re-running this cell is then
# a no-op instead of producing suffixed duplicates (e.g. ProductId_x /
# ProductId_y) that would break the feature-engineering cells below.
columns_to_add = [column for column in merge_columns if column not in test_df.columns]
if columns_to_add:
    test_df = test_df.merge(
        train_df[['Id'] + columns_to_add],
        on='Id',
        how='left'
    )

print(test_df.head())

        Id  Score   ProductId          UserId  HelpfulnessNumerator  \
0  1323432    NaN  B0034G4P30  A120UTHQDQIJGH                     0   
1  1137299    NaN  B0012IWO0I  A3SJBFCTJWBFT2                     1   
2  1459366    NaN  B005FUTBSC  A1V6FRU7EXP6N9                     0   
3   931601    NaN  B000AREXBU   ARXDDR76Z5Q2I                     5   
4  1311995    NaN  B002ZG99B8   A2XAS9GVZL3B1                     2   

   HelpfulnessDenominator        Time  \
0                       0  1391040000   
1                       1  1388188800   
2                       0  1356739200   
3                       5  1169510400   
4                       2  1392595200   

                                             Summary  \
0                                  Okay for a rental   
1                                     Great for kids   
2                                         good movie   
3  Excellent quality dvd.  Don't pay big bucks fo...   
4                                     A mixed 

In [5]:
def add_helpfulness_ratio(df):
    """Add a 'HelpfulnessRatio' column (numerator / denominator) to ``df``.

    The column is added in place and only if it is not already present, so
    calling the function twice is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HelpfulnessNumerator' and 'HelpfulnessDenominator'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'HelpfulnessRatio' present. Rows whose
        denominator is 0 get a ratio of 0.0; a missing (NaN) denominator
        yields NaN.
    """
    if 'HelpfulnessRatio' not in df.columns:
        numerator = df['HelpfulnessNumerator']
        denominator = df['HelpfulnessDenominator']
        nonzero = denominator != 0

        # Vectorized instead of a row-wise apply: zeros are masked out of the
        # division (so nothing is divided by zero) and then filled with 0.0.
        # This also works on an empty frame, where apply(axis=1) would not
        # return a Series.
        ratio = numerator / denominator.where(nonzero)
        df['HelpfulnessRatio'] = ratio.where(nonzero, 0.0)
    return df

def add_date_features(df):
    """Add 'Year', 'Month' and 'Day' columns derived from the 'Time' column.

    'Time' is interpreted as seconds since the Unix epoch. The columns are
    written in place; if all three already exist the frame is left untouched,
    otherwise all three are (re)computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Time' column of epoch seconds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three calendar columns present.
    """
    date_parts = ('Year', 'Month', 'Day')
    if any(part not in df.columns for part in date_parts):
        # Parse the timestamps once and reuse them for all three components.
        timestamps = pd.to_datetime(df['Time'], unit='s')
        df['Year'] = timestamps.dt.year
        df['Month'] = timestamps.dt.month
        df['Day'] = timestamps.dt.day
    return df

def add_tfidf_features(train_df, test_df, n_tfidf_features=100, n_svd_components=30):
    """Append SVD-reduced TF-IDF text features to the train and test frames.

    The text of each row is its 'Summary' and 'Text' joined by a space, with
    missing values treated as empty strings. The TF-IDF vectorizer and the
    TruncatedSVD are fitted on the training text only and then applied to the
    test text. The reduced components are appended as columns named
    'TFIDF_SVD_0' ... 'TFIDF_SVD_{n_svd_components - 1}'.

    If ``train_df`` already has any column starting with 'TFIDF_', both
    frames are returned unchanged.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing 'Summary' and 'Text' columns.
    n_tfidf_features : int
        ``max_features`` passed to the TF-IDF vectorizer.
    n_svd_components : int
        Number of SVD components kept.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``; when features are added, these are new
        frames with a reset (0..n-1) index.
    """
    already_present = any(name.startswith('TFIDF_') for name in train_df.columns)
    if already_present:
        return train_df, test_df

    svd_column_names = [f'TFIDF_SVD_{i}' for i in range(n_svd_components)]

    def combined_text(frame):
        # One string per row: summary, a space, then the review body.
        return frame['Summary'].fillna('') + " " + frame['Text'].fillna('')

    def with_svd_columns(frame, reduced):
        # Reset the index so the positional SVD rows line up with the frame rows.
        svd_frame = pd.DataFrame(reduced, columns=svd_column_names)
        return pd.concat([frame.reset_index(drop=True), svd_frame], axis=1)

    vectorizer = TfidfVectorizer(max_features=n_tfidf_features, stop_words='english')
    reducer = TruncatedSVD(n_components=n_svd_components, random_state=42)

    # Fit on train only; test is transformed with the already-fitted objects.
    train_reduced = reducer.fit_transform(vectorizer.fit_transform(combined_text(train_df)))
    train_out = with_svd_columns(train_df, train_reduced)

    test_reduced = reducer.transform(vectorizer.transform(combined_text(test_df)))
    test_out = with_svd_columns(test_df, test_reduced)

    return train_out, test_out

# Derive the helpfulness ratio and the calendar columns for both frames
# (train is processed first, then test).
train_df, test_df = [
    add_date_features(add_helpfulness_ratio(frame))
    for frame in (train_df, test_df)
]

# Append the SVD-reduced TF-IDF text columns to both frames.
train_df, test_df = add_tfidf_features(
    train_df,
    test_df,
    n_tfidf_features=50,
    n_svd_components=20,
)

In [6]:
# Keep only labelled rows: a row without a Score cannot serve as a training target.
train_df = train_df[train_df['Score'].notna()]

# Features are everything except identifiers, raw text and the label itself.
X_train = train_df.drop(
    columns=['Id', 'ProductId', 'UserId', 'Score', 'Summary', 'Text'],
    errors='ignore',
)
y_train = train_df['Score']  # the actual Score values are the target

# Hold out 25% of the labelled rows for validation, with a fixed seed.
X_train_set, X_val_set, y_train_set, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0
)

In [7]:
# Fit a 50-tree random forest on the training split (fit returns the estimator).
rf_model = RandomForestClassifier(n_estimators=50, random_state=42).fit(
    X_train_set, y_train_set
)

# Validation accuracy: share of held-out rows whose predicted Score equals the true one.
y_pred_rf = rf_model.predict(X_val_set)
accuracy_rf = (y_val == y_pred_rf).mean()

# Keep the metric alongside the model name for later reference.
evaluation_results_rf = dict(Model="RandomForest Classifier", Accuracy=accuracy_rf)
print("RandomForest Validation Accuracy:", accuracy_rf)

RandomForest Validation Accuracy: 0.5487698472542387


In [29]:
# Build the test feature matrix by removing identifiers, raw text and the
# (to-be-predicted) Score column; columns that are absent are ignored.
non_feature_columns = ['Id', 'ProductId', 'UserId', 'Summary', 'Text', 'Score']
X_test = test_df.drop(columns=non_feature_columns, errors='ignore')

# Write the model's predictions back into test_df as the Score column.
test_df['Score'] = rf_model.predict(X_test)

# The submission file holds exactly two columns: Id and the predicted Score.
submission = test_df.loc[:, ['Id', 'Score']]
submission.to_csv("submission.csv", index=False)
submission.head()

In [31]:
# Colab helper: hand the submission.csv written by the previous cell to the
# browser as a file download.
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>