In [None]:
import os

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity
# Combine every matching themes/quotes CSV found under `directory` into one
# DataFrame and save it as combined.csv in the current working directory.

# The path to your directory. Set this before running: with an empty string
# os.walk yields nothing and no file will be found.
directory = ''

# Only CSV files whose name starts with this prefix are picked up
csv_prefix = 'ThemesandQuotevirtualreality'

# Initialize a list to hold all the dataframes
dfs = []

# Walk through every folder in the directory
for root, dirs, files in os.walk(directory):
    # os.walk order depends on the filesystem; sorting in place makes the
    # traversal (and therefore the row order of combined.csv) reproducible
    dirs.sort()
    for file in sorted(files):
        # Check if the file is a csv and has the correct name
        if file.endswith('.csv') and file.startswith(csv_prefix):
            # Construct the full path to the file
            file_path = os.path.join(root, file)
            # Read the csv file, skipping the first row and using fixed column names
            df = pd.read_csv(file_path, skiprows=1, names=["themes", "quotational evidence"])
            # Append the dataframe to the list
            dfs.append(df)

# pd.concat([]) fails with the opaque "No objects to concatenate"; report
# what was searched for instead so the cause is obvious
if not dfs:
    raise ValueError(
        f"No CSV files starting with {csv_prefix!r} were found under {directory!r}; "
        "set `directory` to the folder that contains them."
    )

# Concatenate all dataframes in the list
combined_df = pd.concat(dfs, ignore_index=True)

# Now combined_df contains the combined data from all the csv files
print(combined_df)
combined_df.to_csv('combined.csv', index=False)

In [None]:
# Reload the combined quotes written by the previous step
df = pd.read_csv('combined.csv')

# Keep-mask: True for every row whose "quotational evidence" does NOT mention
# "sorry", "apologies" or "apologize" (case-insensitive). Missing quotes count
# as "no match" (na=False), so those rows are kept.
mask = ~df['quotational evidence'].str.contains(
    'sorry|apologies|apologize',
    case=False,
    na=False,
)

# Retain only the rows selected by the keep-mask
df = df.loc[mask]

# Overwrite combined.csv with the filtered rows
df.to_csv('combined.csv', index=False)

In [None]:
# Cluster the quotes by TF-IDF cosine distance (average-linkage agglomerative
# clustering), draw the matching dendrogram, score each cluster with its mean
# silhouette value and save the result to clusteredcombined.csv.
data = pd.read_csv('combined.csv')

# TfidfVectorizer rejects NaN documents, and the apology filter keeps rows with
# a missing quote, so drop them here. The index is reset so positional arrays
# (cluster labels, silhouette values) line up with the rows.
data = data.dropna(subset=['quotational evidence']).reset_index(drop=True)

quotations = data['quotational evidence'].astype(str)

# tf-idf feature matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(quotations)

# cosine similarity for rows
cosine_sim = cosine_similarity(tfidf_matrix)

# Cosine distance shared by clustering, dendrogram and silhouette scoring.
# Floating-point round-off can leave tiny negative entries or a non-zero
# diagonal (and a quote with no tokens has self-similarity 0), which the
# precomputed-distance code paths reject, so clean both up.
distance_matrix = 1 - cosine_sim
np.clip(distance_matrix, 0, None, out=distance_matrix)
np.fill_diagonal(distance_matrix, 0)

# agglomerative Clustering: merging stops once the average-linkage distance
# between clusters reaches the threshold
cluster = AgglomerativeClustering(n_clusters=None, metric='precomputed', linkage='average', distance_threshold=0.93)
cluster_result = cluster.fit_predict(distance_matrix)

# create linkage matrix
# scipy's linkage() treats a 2-D array as raw observation vectors, not as a
# distance matrix; it needs the condensed (upper-triangle) form to build the
# same hierarchy that was clustered above.
linkage_matrix = linkage(squareform(distance_matrix, checks=False), 'average')

# plot the dendrogram
plt.figure(figsize=(15, 7))
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
plt.xticks(rotation=90)  # rotate x-axis labels by 90 degrees
plt.tight_layout()
plt.show()

data['Cluster'] = cluster_result

# silhouette score for each sample, measured with the same cosine distance the
# clusters were built from (the default metric would be Euclidean)
silhouette_vals = silhouette_samples(distance_matrix, cluster_result, metric='precomputed')

# create a dataframe of silhouette scores
silhouette_df = pd.DataFrame({
    'Cluster': cluster_result,
    'Silhouette Score': silhouette_vals,
})

# calculate average silhouette score for each cluster
average_scores = silhouette_df.groupby('Cluster')['Silhouette Score'].mean()

# map average silhouette score to each sample in the original dataframe
# (every row carries its cluster's mean score, not its own sample score)
data['Silhouette Score'] = data['Cluster'].map(average_scores.to_dict())

data = data.sort_values('Cluster')
data.to_csv('clusteredcombined.csv', index=False)
print('Data has been clustered, sorted, and saved')


In [None]:
# Silhouette plot: one horizontal band per cluster, each band made of that
# cluster's per-sample silhouette values sorted in ascending order.
fig, ax = plt.subplots(figsize=(10, 40))

n_clusters = len(np.unique(cluster_result))
y_lower = 10  # vertical offset at which the next band starts

for i in range(n_clusters):
    # Boolean indexing returns a copy, so sorting never touches silhouette_vals
    cluster_scores = np.sort(silhouette_vals[cluster_result == i])
    cluster_size = cluster_scores.shape[0]
    y_upper = y_lower + cluster_size

    band_color = cm.nipy_spectral(float(i) / n_clusters)
    ax.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        cluster_scores,
        facecolor=band_color,
        edgecolor=band_color,
        alpha=0.7,
    )

    # Cluster number, placed left of the zero line at the band's mid-height
    ax.text(-0.05, y_lower + 0.5 * cluster_size, str(i))

    # Leave a 10-unit gap before the next band
    y_lower = y_upper + 10

ax.set_title("The silhouette plot for the various clusters.")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")

# Dashed red reference line at the mean silhouette value over all samples
average_score = np.mean(silhouette_vals)
ax.axvline(x=average_score, color="red", linestyle="--")

ax.set_yticks([])
ax.set_xticks(np.arange(-0.1, 1.1, 0.1))
plt.show()


In [None]:
import pandas as pd

# Report how many quotes ended up in each cluster.

# Load the clustered data from the same relative path the clustering step
# writes to, so the notebook does not depend on one machine's folder layout
df = pd.read_csv('clusteredcombined.csv')

# Count the rows belonging to each cluster label
cluster_counts = df['Cluster'].value_counts()

# Sort values from most to least common
cluster_counts_sorted = cluster_counts.sort_values(ascending=False)

# Lift the row-display limit temporarily so every cluster is printed
with pd.option_context('display.max_rows', None):
    print(cluster_counts_sorted)
