In [1]:
import pandas as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer
import re
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def process(df: pd.DataFrame):
    """Build the numeric feature frame from the raw review columns.

    Adds integer codes for ``ProductId`` and ``UserId``, word counts for
    ``Summary`` and ``Text``, z-scores a fixed list of numeric columns and
    returns the result without the two raw id columns.

    The input frame is not modified: every new or rescaled column is written
    to the frame returned by ``drop``. Re-running the calling cell therefore
    never standardises data that has already been standardised.

    Args:
        df: Frame containing at least ``ProductId``, ``UserId``, ``Summary``,
            ``Text``, ``Time``, ``HelpfulnessNumerator`` and
            ``HelpfulnessDenominator``.

    Returns:
        A new DataFrame with the original columns minus ``ProductId`` and
        ``UserId``, plus ``ProductCode``, ``UserCode``, ``SummaryLength`` and
        ``TextLength``. The columns listed in the loop below are z-scored.
    """
    # drop() returns a new frame, so all later writes leave the caller's data alone.
    out = df.drop(columns=['ProductId', 'UserId'])

    # factorize returns (codes, uniques); only the integer codes are needed.
    out['ProductCode'] = pd.factorize(df['ProductId'])[0]
    out['UserCode'] = pd.factorize(df['UserId'])[0]

    # A missing summary/text is counted as zero words instead of NaN.
    out['SummaryLength'] = df['Summary'].fillna('').str.split().str.len()
    out['TextLength'] = df['Text'].fillna('').str.split().str.len()

    for col in ['ProductCode', 'UserCode', 'Time', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'SummaryLength', 'TextLength']:
        centered = out[col] - out[col].mean()
        spread = out[col].std()
        if spread > 0:
            out[col] = centered / spread
        else:
            # Constant column (std == 0) or undefined std (NaN, e.g. one row):
            # dividing would give NaN/inf, so emit zeros while keeping any NaN.
            out[col] = centered * 0.0

    return out

In [3]:
def summary_process(df: pd.DataFrame):
    """Replace each ``Summary`` with its space-joined, Snowball-stemmed words.

    Missing summaries become the empty string. Words are extracted with the
    regex ``\\w+``, stemmed with the English Snowball stemmer and re-joined
    with single spaces.

    The ``Summary`` column of ``df`` is overwritten in place and the same
    frame is returned. Rows are processed positionally, so the frame may
    carry any index (filtered, shuffled, non-integer).

    Args:
        df: Frame with a ``Summary`` column of strings (NaN allowed).

    Returns:
        The same DataFrame object with ``Summary`` stemmed.
    """
    df['Summary'] = df['Summary'].fillna('')
    snow_stemmer = SnowballStemmer(language='english')

    stemmed = []
    # Iterate over the values rather than index labels: label lookups with
    # range(len(df)) only work for a default RangeIndex and are far slower.
    for i, summary in enumerate(df['Summary'].tolist()):
        if i % 100000 == 0:
            print("summary process: ", i)
        stemmed.append(' '.join(snow_stemmer.stem(w) for w in re.findall(r'\w+', summary)))

    # A single column assignment; the list is positionally aligned with the rows.
    df['Summary'] = stemmed
    return df

def summary_analysis(df: pd.DataFrame, min_df=0.01, max_df=0.75):
    """Append one TF-IDF column per vocabulary term of ``Summary``.

    Args:
        df: Frame with a ``Summary`` column. Missing values are treated as
            empty documents for vectorisation; ``df`` itself is not modified.
        min_df: Passed to ``TfidfVectorizer``; terms below this document
            frequency are ignored. Defaults to the previously hard-coded 0.01.
        max_df: Passed to ``TfidfVectorizer``; terms above this document
            frequency are ignored. Defaults to the previously hard-coded 0.75.

    Returns:
        A new DataFrame: the columns of ``df`` followed by one dense float
        column per term, named after the term, with the same index as ``df``.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    tfidf = vectorizer.fit_transform(df['Summary'].fillna('').to_numpy())

    # Densify once instead of row by row, and reuse df's index so the
    # column-wise concat below lines rows up even for a non-default index
    # (concat aligns on index labels, not on position).
    result_df = pd.DataFrame(
        tfidf.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )
    return pd.concat([df, result_df], axis=1)

In [4]:
def text_process(df: pd.DataFrame):
    """Replace each ``Text`` with its space-joined, Snowball-stemmed words.

    Missing texts become the empty string. Words are extracted with the
    regex ``\\w+``, stemmed with the English Snowball stemmer and re-joined
    with single spaces. Progress is printed every 100000 rows and the total
    wall-clock time is printed at the end.

    The ``Text`` column of ``df`` is overwritten in place and the same frame
    is returned. Rows are processed positionally, so the frame may carry any
    index (filtered, shuffled, non-integer).

    Args:
        df: Frame with a ``Text`` column of strings (NaN allowed).

    Returns:
        The same DataFrame object with ``Text`` stemmed.
    """
    start = datetime.datetime.now()
    df['Text'] = df['Text'].fillna('')
    snow_stemmer = SnowballStemmer(language='english')

    stemmed = []
    # Iterate over the values rather than index labels: label lookups with
    # range(len(df)) only work for a default RangeIndex and are far slower.
    for i, text in enumerate(df['Text'].tolist()):
        if i % 100000 == 0:
            print("text process: ", i)
        stemmed.append(' '.join(snow_stemmer.stem(w) for w in re.findall(r'\w+', text)))

    # A single column assignment; the list is positionally aligned with the rows.
    df['Text'] = stemmed
    end = datetime.datetime.now()
    print("running time:", end-start)
    return df

def text_analysis(df: pd.DataFrame, min_df=1, max_df=1.0):
    """Append one TF-IDF column per vocabulary term of ``Text``.

    Prints a header and the elapsed wall-clock time.

    The result is dense: it holds ``len(df) * vocabulary_size`` floats. With
    the default ``min_df``/``max_df`` nothing is pruned from the vocabulary,
    so on a large corpus raise ``min_df`` and/or lower ``max_df`` to keep the
    output at a manageable size.

    Args:
        df: Frame with a ``Text`` column. Missing values are treated as empty
            documents for vectorisation; ``df`` itself is not modified.
        min_df: Passed to ``TfidfVectorizer``. The default 1 matches the
            vectorizer's own default, i.e. the previous behaviour.
        max_df: Passed to ``TfidfVectorizer``. The default 1.0 matches the
            vectorizer's own default, i.e. the previous behaviour.

    Returns:
        A new DataFrame: the columns of ``df`` followed by one dense float
        column per term, named after the term, with the same index as ``df``.
    """
    print("text analysis:")
    start = datetime.datetime.now()
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    tfidf = vectorizer.fit_transform(df['Text'].fillna('').to_numpy())

    # Densify once instead of row by row, and reuse df's index so the
    # column-wise concat below lines rows up even for a non-default index
    # (concat aligns on index labels, not on position).
    result_df = pd.DataFrame(
        tfidf.toarray(),
        index=df.index,
        columns=vectorizer.get_feature_names_out(),
    )
    end = datetime.datetime.now()
    print("running time:", end-start)
    return pd.concat([df, result_df], axis=1)

In [5]:
# Read the training CSV
trainingSet = pd.read_csv("./data/train.csv")
# Read the test (submission) CSV
submissionSet = pd.read_csv("./data/test.csv")
# Report the number of rows in each frame
print("train length: ", trainingSet.shape[0])
print("submission length: ", submissionSet.shape[0])

train length:  1697533
submission length:  300000


In [6]:
# Summary stemming is currently switched off; only the review text is stemmed.
# trainingSet = summary_process(trainingSet)
# NOTE: the recorded run of this cell took about 39 minutes on the
# 1,697,533-row training frame (see the output below).
trainingSet = text_process(trainingSet)

text process:  0
text process:  100000
text process:  200000
text process:  300000
text process:  400000
text process:  500000
text process:  600000
text process:  700000
text process:  800000
text process:  900000
text process:  1000000
text process:  1100000
text process:  1200000
text process:  1300000
text process:  1400000
text process:  1500000
text process:  1600000
running time: 0:39:03.905326


In [7]:
# trainingSet = summary_analysis(trainingSet)
# trainingSet.columns.values.tolist()

In [None]:
# Append the TF-IDF term columns for the review text, then show all column names
trainingSet = text_analysis(trainingSet)
trainingSet.columns.tolist()

In [None]:
# Build the feature frame and show the resulting column names
train_processed = process(trainingSet)
train_processed.columns.tolist()

In [None]:
# Give every submission row its feature columns by joining on Id. Both frames
# carry a Score column, so the merge suffixes them: the one from the feature
# frame (Score_x) is dropped and the submission one (Score_y) becomes Score.
testX = (
    train_processed
    .merge(submissionSet, on='Id')
    .drop(columns=['Score_x'])
    .rename(columns={'Score_y': 'Score'})
)

# Rows that already have a Score make up the training set
trainX = train_processed.loc[train_processed['Score'].notna()]

# Write both feature sets to disk without the index column
testX.to_csv("./data/X_test_text.csv", index=False)
trainX.to_csv("./data/X_train_text.csv", index=False)