In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [17]:
import pandas as pd

# Read the essay spreadsheet into `df`; every later cell derives its
# inputs from this frame.
df = pd.read_excel(io="./dataset/essay_dataset_v1.xlsx")


In [9]:
# Positional split of the sheet:
#   - first five columns -> per-essay metadata (document_number,
#     domain1_score, word_count, stopword, unique_words)
#   - sixth column onward -> treated as the word-count features
cols_meta, cols_tf = df.iloc[:, :5], df.iloc[:, 5:]

# Quick sanity check on what ended up on each side of the split.
print("Meta columns:", cols_meta.columns)
print("TF columns:", cols_tf.shape[1])

Meta columns: Index(['document_number', 'domain1_score', 'word_count', 'stopword',
       'unique_words'],
      dtype='object')
TF columns: 1218


In [10]:
# Re-weight the count columns with TF-IDF. The fitted transformer stays
# available as `tfidf`; its output is converted with .toarray() so it can be
# wrapped in a DataFrame and reused by the later cells.
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit(cols_tf).transform(cols_tf).toarray()

# Same rows as `df`, same column labels as the count features.
df_tfidf_words = pd.DataFrame(
    data=X_tfidf,
    index=df.index,
    columns=cols_tf.columns,
)


In [12]:
# Place the metadata columns to the left of the TF-IDF word columns.
df_tfidf = pd.concat(objs=[cols_meta, df_tfidf_words], axis="columns")

# Write the combined table to disk, leaving the row index out of the sheet.
df_tfidf.to_excel(
    excel_writer="./dataset/dataset_after_tfidf.xlsx",
    index=False,
)

print("Saved: dataset_after_tfidf.xlsx")


Saved: dataset_after_tfidf.xlsx


In [13]:
# Standardise each TF-IDF column before PCA. The fitted scaler is kept in
# `scaler` so the same transformation can be reused later.
scaler = StandardScaler()
X_scaled = scaler.fit(X_tfidf).transform(X_tfidf)

print("Scaled TF-IDF shape:", X_scaled.shape)


Scaled TF-IDF shape: (141, 1218)


In [14]:
# Number of principal components to keep.
NUM_COMPONENTS = 50   # change to what you want

# Seed for the PCA solver. With the default svd_solver="auto", an input of
# this size (141 x 1218 per the scaler cell's output) and 50 components,
# scikit-learn selects the randomized SVD solver, which is stochastic. Without
# a fixed seed the component scores (and the Excel file written from them)
# can differ from run to run.
# NOTE(review): solver selection follows the scikit-learn PCA docs; confirm
# for the installed version. Passing random_state is harmless either way.
PCA_RANDOM_STATE = 42

pca = PCA(n_components=NUM_COMPONENTS, random_state=PCA_RANDOM_STATE)
X_pca = pca.fit_transform(X_scaled)

print("PCA output shape:", X_pca.shape)


PCA output shape: (141, 50)


In [15]:
# Label the component columns PC1, PC2, ... — one label per column of X_pca.
pca_cols = ["PC" + str(k) for k in range(1, X_pca.shape[1] + 1)]

# Wrap the component scores in a frame that shares the row index of `df`.
df_pca_words = pd.DataFrame(data=X_pca, index=df.index, columns=pca_cols)


In [20]:
# Place the metadata columns to the left of the principal-component columns.
df_pca = pd.concat(objs=[cols_meta, df_pca_words], axis="columns")

# Write the reduced table to disk, leaving the row index out of the sheet.
df_pca.to_excel(
    excel_writer="./dataset/dataset_after_pca.xlsx",
    index=False,
)

print("Saved: dataset_after_pca.xlsx")


Saved: dataset_after_pca.xlsx
