## Let us find the topics of these essays

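A good essay must be relevant to its topic and should demonstrate domain knowledge. To judge relevance we first need to know which topics the essays cover, so this notebook clusters the essays by their TF-IDF vocabulary.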



In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Load the competition's train and test splits. No index_col is passed, so
# both frames keep pandas' default 0..n-1 row index.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv",
        "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv",
    )
)




In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# TF-IDF representation of the training essays.
#   min_df=5           -> drop terms that appear in fewer than 5 essays
#   max_df=0.95        -> drop terms that appear in more than 95% of essays
#   max_features=8000  -> cap the vocabulary at 8000 terms
#   stop_words         -> use scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(
    min_df = 5,
    max_df = 0.95,
    max_features = 8000,
    stop_words = 'english'
)
# fit_transform learns the vocabulary/IDF weights and vectorises the essays in
# a single pass; the previous fit(...) followed by transform(...) on the same
# corpus produced the same matrix but tokenised every essay twice.
text = tfidf.fit_transform(train_df.full_text)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import MiniBatchKMeans
def find_optimal_clusters(data, max_k):
    """Draw an elbow plot of k-means inertia for k = 1 .. max_k.

    A MiniBatchKMeans model is fitted on ``data`` for every cluster count
    from 1 up to and including ``max_k``; a progress line is printed after
    each fit. The resulting inertias (sum of squared errors) are then plotted
    against the cluster count on a new matplotlib figure.

    Parameters
    ----------
    data : feature matrix accepted by ``MiniBatchKMeans.fit``.
    max_k : int
        Largest number of clusters to try.

    Returns
    -------
    None
    """
    candidate_ks = range(1, max_k + 1)

    inertias = []
    for n_clusters in candidate_ks:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            init_size=1024,
            batch_size=2048,
            random_state=20,
        )
        fitted = model.fit(data)
        inertias.append(fitted.inertia_)
        print('Fit {} clusters'.format(n_clusters))

    # One tick per candidate k so the elbow can be read off directly.
    _, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')
    
# Scan k = 1..20 on the TF-IDF matrix to look for an elbow in the SSE curve.
find_optimal_clusters(data=text, max_k=20)

### The SSE plot suggests about 7 clusters, i.e. 7 major topics being discussed at a broad level

In [None]:
# Assign every training essay to one of 7 k-means clusters; random_state is
# fixed so the cluster ids stay the same from run to run.
clusters = MiniBatchKMeans(
    n_clusters=7,
    init_size=1024,
    batch_size=2048,
    random_state=20,
).fit_predict(text)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np

def plot_tsne_pca(data, labels, sample_size=3000, plot_size=300, random_state=None):
    """Scatter-plot a random sample of clustered rows in PCA and t-SNE space.

    A random subset of rows is densified and projected to 2-D twice: once
    directly with PCA, and once with t-SNE applied to a PCA reduction of up to
    50 components. A smaller random subset of those projected points is drawn
    on two side-by-side axes, coloured by cluster label.

    Parameters
    ----------
    data : sparse matrix
        Row-indexable matrix exposing ``.shape`` and ``.toarray()``.
    labels : array-like of int
        Cluster id for every row of ``data`` (same order as the rows).
    sample_size : int, default 3000
        Maximum number of rows to project. Capped at the number of rows
        available, so small inputs no longer raise.
    plot_size : int, default 300
        Maximum number of projected points to draw. Capped at the number of
        projected rows.
    random_state : int or None, default None
        Seed for the row sampling and for PCA / t-SNE. ``None`` keeps the
        previous behaviour (NumPy's global RNG, unseeded estimators).

    Returns
    -------
    None
    """
    labels = np.asarray(labels)
    n_rows = data.shape[0]
    # With no seed, fall back to the module-level RNG exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sampling without replacement cannot request more rows than exist.
    n_sampled = min(sample_size, n_rows)
    max_items = rng.choice(n_rows, size=n_sampled, replace=False)

    # Densify the sampled rows once and reuse them for both projections.
    dense = data[max_items, :].toarray()

    pca = PCA(n_components=2, random_state=random_state).fit_transform(dense)
    # PCA cannot produce more components than min(n_samples, n_features).
    n_pre = min(50, dense.shape[0], dense.shape[1])
    reduced = PCA(n_components=n_pre, random_state=random_state).fit_transform(dense)
    tsne = TSNE(random_state=random_state).fit_transform(reduced)

    idx = rng.choice(n_sampled, size=min(plot_size, n_sampled), replace=False)

    # hsv is a cyclic colormap: 0.0 and 1.0 are both red. Dividing by the
    # number of clusters (max label + 1) keeps every value in [0, 1), so the
    # first and last cluster get different colours, and a single cluster no
    # longer divides by zero.
    n_colors = int(labels.max()) + 1
    label_subset = labels[max_items]
    point_colors = [cm.hsv(i / n_colors) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=point_colors)
    ax[0].set_title('PCA Cluster Plot')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=point_colors)
    ax[1].set_title('TSNE Cluster Plot')
    
# Visual sanity check: do the k-means clusters separate in 2-D projections?
plot_tsne_pca(data=text, labels=clusters)

In [None]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the ``n_terms`` highest-weighted terms of every cluster.

    The matrix is densified, its rows are grouped by cluster id, and each
    column is averaged within a cluster. For every cluster the terms with the
    largest mean weight are printed as one comma-separated line, in ascending
    order of weight (the strongest term comes last).

    Parameters
    ----------
    data : matrix exposing ``.todense()``; one row per document.
    clusters : array-like with one cluster id per row of ``data``.
    labels : sequence mapping a column position to its term.
    n_terms : int
        Number of terms to print per cluster.

    Returns
    -------
    None
    """
    dense_rows = pd.DataFrame(data.todense())
    cluster_means = dense_rows.groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        print('\nCluster {}'.format(cluster_id))
        # argsort is ascending, so the tail holds the largest mean weights.
        top_positions = np.argsort(mean_weights)[-n_terms:]
        top_terms = [labels[position] for position in top_positions]
        print(','.join(top_terms))
            
# List the 50 most characteristic TF-IDF terms of each cluster.
get_top_keywords(
    data=text,
    clusters=clusters,
    labels=tfidf.get_feature_names_out(),
    n_terms=50,
)



#### Cluster 0 - seems to consist of essays about Europe, adventures and sports

#### Cluster 1 - seems to consist of essays about the safety aspects of driverless cars

#### Cluster 2 - seems to consist of essays about planets, high temperatures and pressure

#### Cluster 3 - seems to consist of essays about school, computers and human interaction/emotions in the computing era

#### Cluster 4 - seems to consist of essays about space and the existence of aliens

#### Cluster 5 - seems to consist of essays about elections

#### Cluster 6 - seems to consist of essays about vehicles, the environment and happiness


In [None]:
# Print ten sample essays (rows 90-99) next to their assigned cluster id.
# `clusters` is the array returned by fit_predict, so it is ordered by row
# position. Pairing it with the essays by position (instead of indexing it
# with the DataFrame's index labels) stays correct even if train_df is later
# given a non-default index such as essay_id.
for cluster_id, essay_text in zip(clusters[90:100], train_df['full_text'].iloc[90:100]):
    print(cluster_id, essay_text)
    print("_________________________")
    