# C. Clustering-based augmentation

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training data and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call -- confirm).
train_df = pd.read_csv("train_df2.csv").drop(columns=['Unnamed: 0'])
train_df

## Use K-Means, based on an appropriate text representation and the (estimated) optimum K, to cluster the generated essays, and then the student essays.

In [None]:
# Split the training data by label: rows with generated == 1 and rows with generated == 0.
# .copy() makes each subset an independent DataFrame, so the columns added to it
# in later cells ('tokenized_text', 'cluster') are written to a real frame rather
# than to a filtered selection of train_df (which raises SettingWithCopyWarning).
generated_df = train_df[train_df['generated'] == 1].copy()
student_df = train_df[train_df['generated'] == 0].copy()

In [None]:
# Tokenise each generated essay, lower-case every token and discard English stop words.
generated_df['tokenized_text'] = generated_df['text'].apply(
    lambda essay: [
        token
        for token in (raw.lower() for raw in word_tokenize(essay))
        if token not in ENGLISH_STOP_WORDS
    ]
)

# CBOW (sg=0) Word2Vec embeddings trained only on the generated essays.
model1 = Word2Vec(
    sentences=generated_df['tokenized_text'],
    vector_size=300,
    window=3,
    min_count=2,
    sg=0,
    epochs=10,
    workers=10,
)

In [None]:
# Tokenise each student essay, lower-case every token and discard English stop words.
student_df['tokenized_text'] = student_df['text'].apply(
    lambda essay: [
        token
        for token in (raw.lower() for raw in word_tokenize(essay))
        if token not in ENGLISH_STOP_WORDS
    ]
)

# CBOW (sg=0) Word2Vec embeddings trained only on the student essays.
model2 = Word2Vec(
    sentences=student_df['tokenized_text'],
    vector_size=300,
    window=3,
    min_count=2,
    sg=0,
    epochs=10,
    workers=10,
)

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the tokens in *words* that are in *vocabulary*.

    Args:
        words: Iterable of tokens for one document.
        model: Object whose ``wv`` attribute maps a token to its vector.
        vocabulary: Container of tokens that have a vector in ``model.wv``.
        num_features: Length of the embedding vectors.

    Returns:
        A 1-D array of length ``num_features``. It is all zeros when no
        token of *words* is in *vocabulary*.
    """
    total = np.zeros((num_features,), dtype="float32")
    matched = 0.
    for token in words:
        if token not in vocabulary:
            continue
        total = total + model.wv[token]
        matched += 1.
    # Avoid dividing by zero for documents without any known token.
    if not matched:
        return total
    return total / matched

def averaged_word_vectorizer(corpus, model, num_features):
    """Turn every tokenised document of *corpus* into its averaged word vector.

    Args:
        corpus: Iterable of token lists, one per document.
        model: Model exposing ``wv.index_to_key`` and ``wv[token]``.
        num_features: Length of the embedding vectors.

    Returns:
        ``np.array`` built from one averaged vector per document.
    """
    # Build the membership set once so each token lookup is O(1).
    known_tokens = set(model.wv.index_to_key)
    document_vectors = []
    for document_tokens in corpus:
        document_vectors.append(
            average_word_vectors(document_tokens, model, known_tokens, num_features)
        )
    return np.array(document_vectors)

def get_top_words_for_cluster(kmeans_model, vectorizer_model, num_words=10):
    """Return, for every cluster, the vocabulary words closest to its centroid.

    The centroids live in the embedding space (one coordinate per embedding
    dimension), not in a bag-of-words space, so the index of a large centroid
    coordinate is NOT a vocabulary index. Words are therefore ranked by cosine
    similarity between their embedding and the centroid.

    Args:
        kmeans_model: Fitted clustering model exposing ``cluster_centers_``
            (shape ``(n_clusters, dim)``) and ``n_clusters``.
        vectorizer_model: Embedding model exposing ``wv.vectors``
            (shape ``(vocab_size, dim)``) and ``wv.index_to_key``.
        num_words: Maximum number of words to return per cluster.

    Returns:
        Dict mapping ``'Cluster <i>'`` to a list of at most ``num_words``
        words, most similar first.
    """
    keyed_vectors = vectorizer_model.wv
    vocabulary = keyed_vectors.index_to_key
    word_matrix = np.asarray(keyed_vectors.vectors, dtype="float64")
    centroids = np.asarray(kmeans_model.cluster_centers_, dtype="float64")

    # Normalise rows to unit length; zero vectors keep norm 1 to avoid 0/0.
    word_norms = np.linalg.norm(word_matrix, axis=1, keepdims=True)
    word_norms[word_norms == 0] = 1.0
    centroid_norms = np.linalg.norm(centroids, axis=1, keepdims=True)
    centroid_norms[centroid_norms == 0] = 1.0

    # similarities[i, j] = cosine(centroid i, word j)
    similarities = (centroids / centroid_norms) @ (word_matrix / word_norms).T
    ranked = np.argsort(-similarities, axis=1, kind="stable")[:, :num_words]

    return {
        f'Cluster {i}': [vocabulary[j] for j in ranked[i]]
        for i in range(kmeans_model.n_clusters)
    }

In [None]:
# Represent every generated essay as the mean Word2Vec vector of its tokens.
word2vec_features = averaged_word_vectorizer(corpus=generated_df['tokenized_text'], model=model1, num_features=300)

# Choose K in [2, max_clusters] by the highest mean silhouette score.
max_clusters = 10
silhouette_scores = []

for num_clusters in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    labels = kmeans.fit_predict(word2vec_features)
    silhouette_avg = silhouette_score(word2vec_features, labels)
    silhouette_scores.append(silhouette_avg)

optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2  # +2 because we started from 2 clusters

# Fit the final model once; fit_predict already returns the label of every essay,
# so a separate predict() pass over the same data is unnecessary.
kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=42)
generated_df['cluster'] = kmeans.fit_predict(word2vec_features)

top_words_per_cluster = get_top_words_for_cluster(kmeans, model1, num_words=20)

# The dict keys already read 'Cluster <i>', so print them as-is
# (prefixing another "Cluster " produced "Cluster Cluster 0: ...").
for cluster, top_words in top_words_per_cluster.items():
    print(f"{cluster}: {', '.join(top_words)}")

## Yield a title per cluster, reflecting the topic of the texts included.

Cluster 0: "Critical Infrastructure Protection"

Cluster 1: "Air Quality and Environmental Perspectives"

In [None]:
# Represent every student essay as the mean Word2Vec vector of its tokens.
# model2 (trained on the student essays) is used here so that the cluster
# centroids live in the same embedding space as the vocabulary that
# get_top_words_for_cluster is given below; embedding with model1 mixed two
# unrelated spaces.
word2vec_features = averaged_word_vectorizer(corpus=student_df['tokenized_text'], model=model2, num_features=300)

# Choose the optimal number of clusters using silhouette score
max_clusters = 10
silhouette_scores = []

for num_clusters in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    labels = kmeans.fit_predict(word2vec_features)
    silhouette_avg = silhouette_score(word2vec_features, labels)
    silhouette_scores.append(silhouette_avg)

optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2  # +2 because we started from 2 clusters

# Perform K-means clustering with the optimal number of clusters.
# fit_predict already returns the label of every essay, so a separate
# predict() pass over the same data is unnecessary.
kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=42)
student_df['cluster'] = kmeans.fit_predict(word2vec_features)

# Get the top words for each cluster
top_words_per_cluster = get_top_words_for_cluster(kmeans, model2, num_words=20)

# Display the top words for each cluster. The dict keys already read
# 'Cluster <i>', so print them as-is instead of "Cluster Cluster <i>".
for cluster, top_words in top_words_per_cluster.items():
    print(f"{cluster}: {', '.join(top_words)}")

Cluster 0: "Presidential Campaigns and Strategies"

Cluster 1: "Diverse Opinions on Presidential Driving Factors"

## Compare the cluster balance (number of instances per cluster) between the two clusterings.

In [None]:
# Number of student essays assigned to each K-Means cluster.
student_df['cluster'].value_counts()

In [None]:
# Number of generated essays assigned to each K-Means cluster.
generated_df['cluster'].value_counts()

## Generate more texts (as in A) in order to better balance your clusters.

In [None]:
# student_cluster_1_prompt = 'Write an essay, up to 600 words with topic: "Diverse Opinions on Presidential Driving Factors". Similar essays had as top 20 words: smaller, presidential, duffer, united, driving, does, away, popular, count, 4, reasons, congress, free, walter, government, didnt, bogota, little, process, number'

In [None]:
# generated_cluster_0_prompt = 'Write an essay, up to 600 words with topic:"Critical Infrastructure Protection". Similar essays had as top 20 words:safeguard, popular, numerous, infrastructure, voices, heavily, matter, spaces, critics, thank, outcomes, intro, activity, imperative, economic, transit, promote, representative, inclusive, prevents'

In [None]:
# DISABLED CELL: everything below is a triple-quoted string literal, so none of
# it is executed. As written, the code would request 40 completions for
# student_cluster_1_prompt and 170 for generated_cluster_0_prompt and append
# each raw response text to essays_list (210 entries in total).
# NOTE(review): both prompt variables are only present as commented-out
# assignments in the cells above; they must be uncommented, and an API key
# supplied, before this cell can be re-enabled.
'''from openai import OpenAI
import json
essays_list = []
client = OpenAI(api_key = "")

for i in range(40):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-1106",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
        {"role": "user", "content": student_cluster_1_prompt}
      ]
    )
    text = response.choices[0].message.content
    essays_list.append(text)
for i in range(170):
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-1106",
      response_format={ "type": "json_object" },
      messages=[
        {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
        {"role": "user", "content": generated_cluster_0_prompt}
      ]
    )
    text = response.choices[0].message.content
    essays_list.append(text)'''

In [None]:
# DISABLED CELL: the helpers below sit inside a triple-quoted string literal and
# are therefore never defined. As written, clean_text would (1) delete every
# character that is not a word character, whitespace or a period, (2) delete the
# listed words (case-insensitively), (3) collapse runs of spaces
# into one and (4) strip leading/trailing whitespace.
'''import re
def remove_punctuation_except_periods(text):
    return re.sub(r'[^\w\s.]', '', text)

def remove_specific_words(text):
    words_to_remove = ['essay', 'body', 'introduction','conclusion','title','response',' \n'] + [f'paragraph{x}' for x in range(10)]  # Adjust the range as needed
    pattern = '\\b(?:' + '|'.join(map(re.escape, words_to_remove)) + ')\\b'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

def clean_text(text):
    text = remove_punctuation_except_periods(text)
    text = remove_specific_words(text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    return text'''

In [None]:
# DISABLED CELL: wrapped in a triple-quoted string literal, so it does not run.
# As written, it would collect the generated == 1 rows of train_df.csv and
# train_df2.csv into `augmentation`, then append the 210 cleaned essays from
# essays_list: the first 40 with prompt_id 3, the remaining 170 with prompt_id 4.
# NOTE(review): DataFrame.append was removed in pandas 2.0; re-enabling this
# cell on a current pandas requires pd.concat instead.
'''train_df1 = pd.read_csv('train_df.csv')
train_df2 = pd.read_csv('train_df2.csv')
train_df1.drop(axis=1,columns=['Unnamed: 0'],inplace=True)
train_df2.drop(axis=1,columns=['Unnamed: 0'],inplace=True)
train_df1 = train_df1[train_df1['generated']==1]
train_df2 = train_df2[train_df2['generated']==1]
augmentation = pd.concat([train_df1,train_df2])
for i in range(210):
    essays_list[i] = clean_text(essays_list[i])
    if (i < 40):
        new_row = {'prompt_id':3, 'text':essays_list[i],'generated':1}
    else:
        new_row = {'prompt_id':4, 'text':essays_list[i],'generated':1}
    augmentation = augmentation.append(new_row, ignore_index=True)
'''

In [None]:
# augmentation.to_csv('augmentation.csv',index=False)

In [None]:
# DISABLED CELL: wrapped in a triple-quoted string literal, so it does not run.
# As written, it would add the augmentation rows with prompt_id 3 or 4 to
# train_df and save the result as train_df_after_C.csv.
# NOTE(review): DataFrame.append was removed in pandas 2.0; use pd.concat if
# this cell is re-enabled.
'''train_df = train_df.append(augmentation[(augmentation['prompt_id']==3)|(augmentation['prompt_id']==4)])
train_df.to_csv('train_df_after_C.csv',index=False)'''

In [None]:
# Load the augmented training set from disk. 'train_df_after_C.csv' is the file
# name the disabled cell above would write, so the file must already exist.
train_df_filled = pd.read_csv('train_df_after_C.csv')
train_df_filled

## Re-train your best-performing classifier on the new data (or a careful selection of it) and analyze the benefits of using clustering to improve the classifier.

In [None]:
# Hold out 20% of the augmented data for evaluation.
X = train_df_filled.text
y = train_df_filled.generated
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Fit the TF-IDF vocabulary on the training split only and reuse it for the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# LinearSVC accepts scipy sparse matrices directly, so the TF-IDF output is
# used as-is. Expanding it with .toarray() allocated a dense
# (n_documents x vocabulary_size) array for no benefit.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")