## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from textblob import TextBlob
import nltk
from gensim import corpora, models 
# import matplotlib.pyplot as plt
# import seaborn as sns

In [None]:
# os.remove("/kaggle/working/submission.csv")
# os.remove("/kaggle/working/state.db")

## Data exploration

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Load the training set; later cells read its 'essay_id', 'full_text' and 'score' columns.

train_df0 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')

In [None]:
# Show the dtype of the target column.
train_df0['score'].dtypes

In [None]:
# Preview the first 10 training rows.
train_df0.head(10)

In [None]:
# Number of essays per distinct score value.
train_df0['score'].value_counts()

## Data Cleaning

In [None]:
# Lookup table mapping English contractions to their expanded forms.
#
# The first-person entries are listed both capitalised ("I'm") and lower-cased
# ("i'm"): dataPreprocessing lower-cases each essay before expanding
# contractions, so the capitalised keys alone would never match there.
# Lower-case variants expand to lower-case text to stay consistent with that
# already lower-cased input.
#
# NOTE(review): "it'd", "there'd", "we'd" and "you'd" expand to "... had" while
# "he'd", "she'd", "they'd" and "that'd" expand to "... would"; both readings
# are valid English, so the existing choices are left untouched.
cList = {
    "ain't": "am not", "aren't": "are not",
    "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not",
    "it'd": "it had", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there had", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not",
    "we'd": "we had", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'alls": "you alls", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    # "you'll" / "you'll've" previously expanded to "you you will ..." (duplicated word).
    "you'd": "you had", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}

In [None]:
# Single alternation pattern matching any key of cList.
# Regex alternation takes the first branch that matches, so the keys are
# ordered longest-first; otherwise "can't" would win over "can't've" and leave
# a dangling "'ve" behind. Keys are escaped so they are always matched
# literally.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(cList, key=len, reverse=True)))

In [None]:
def expandContractions(text, c_re=c_re):
    """Return ``text`` with each match of ``c_re`` replaced by its ``cList`` expansion.

    ``c_re`` defaults to the module-level pattern compiled from the keys of
    ``cList``; the matched text is used as the lookup key.
    """
    return c_re.sub(lambda hit: cList[hit.group(0)], text)

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so a span is
    only removed when ``<`` and the next ``>`` sit on the same line.
    """
    return re.sub(r'<.*?>', r'', x)

In [None]:
def _cleanEssayText(s):
    """Apply the cleaning steps of ``dataPreprocessing`` to a single string."""
    s = s.lower()
    s = removeHTML(s)
    s = re.sub(r"@\w+", '', s)       # @mentions
    s = re.sub(r"'\d+", '', s)       # apostrophe followed by digits
    s = re.sub(r"\d+", '', s)        # any remaining digits
    # \S+ rather than \w+: the URL continues past "://", which \w cannot
    # match, so the old pattern removed only "http"/"https" and left the
    # rest of the address in the text.
    s = re.sub(r"http\S+", '', s)
    s = re.sub(r"\s+", " ", s)       # collapse whitespace (newlines included)
    s = expandContractions(s)
    # Removing every non-word, non-space character also covers the repeated
    # '.' / ',' that used to be collapsed first; no newline can remain after
    # the whitespace step, so no separate newline removal is needed.
    s = re.sub(r'[^\w\s]', '', s)
    return s.strip()


def dataPreprocessing(x):
    """Clean a pandas Series of essay strings and return the cleaned Series.

    Each string is lower-cased, stripped of ``<...>`` spans, @mentions, digits
    and URLs, whitespace-normalised, has its contractions expanded, and
    finally loses all punctuation and surrounding whitespace.

    Parameters
    ----------
    x : pandas.Series
        Series whose values are strings (a non-string value raises
        ``AttributeError`` on ``.lower()``).

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as ``x``.
    """
    # One pass over the Series instead of one pass per cleaning step.
    return x.apply(_cleanEssayText)

In [None]:
# Clean the raw training essays.
x = dataPreprocessing(train_df0['full_text'])  

In [None]:
# Display the cleaned training essays.
x

In [None]:
# Overwrite the raw text with the cleaned text; the original essays are no longer kept in train_df0.
train_df0['full_text'] = x

In [None]:
# Load the test set from the same competition directory as the training set.
test_df0 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

In [None]:
# Clean the test essays with the same preprocessing used for the training essays.
x0 = dataPreprocessing(test_df0['full_text'])

In [None]:
# Overwrite the raw test text with the cleaned text.
test_df0['full_text'] = x0

## Topic distribution for each essay (Feature Engineering 1)

In [None]:
 # assume documents is a list of strings  
documents = train_df0['full_text']  
  
# assume essay_ids is a list of IDs corresponding to the essays  
essay_ids = train_df0['essay_id']
  
# preprocess documents
texts = [doc.split() for doc in documents]  
  
# create a Gensim dictionary from the texts  
dictionary = corpora.Dictionary(texts)  
  
# create a Gensim corpus from the texts  
corpus = [dictionary.doc2bow(text) for text in texts]  
  
# train the LDA model on the corpus  
lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=123, alpha=0.01)  
  
# get the topic distribution for each document  
topic_dist = [lda.get_document_topics(bow) for bow in corpus]  
  
# initialize a dataframe  
train_df1 = pd.DataFrame(columns=['essay_id'] + ['topic{}'.format(i) for i in range(10)])  
  
for i, topics in enumerate(topic_dist):  
    # topics is a list of (topic_id, topic_probability) pairs  
    topic_prob = [0]*10  
    for topic, prob in topics:  
        topic_prob[topic] = prob  
  
    train_df1.loc[i] = [essay_ids[i]] + topic_prob 

In [None]:
# Top 10 words for each of the 10 topics, returned unformatted rather than as strings.
topic_train = lda.show_topics(num_topics=10, num_words=10, formatted=False)

In [None]:
# Print the topic/word listing of the training LDA model.
print(topic_train)

In [None]:
# Preview the per-essay topic probabilities.
train_df1.head(10)

In [None]:
# documents_t is the Series of cleaned test essays
documents_t = test_df0['full_text']

# essay_ids_t is the Series of IDs, aligned row-for-row with documents_t
essay_ids_t = test_df0['essay_id']

# tokenise each essay on whitespace
texts_t = [doc.split() for doc in documents_t]

# Reuse the dictionary and LDA model fitted on the TRAINING essays.
# Fitting a second dictionary/model on the test set produces unrelated
# topics, so "topic3" of a test essay would not mean the same thing as the
# "topic3" feature the classifier was trained on.
# dictionary_t / lda_t are kept as aliases because later cells reference them.
dictionary_t = dictionary
lda_t = lda

# encode the test essays with the training vocabulary
corpus_t = [dictionary_t.doc2bow(text) for text in texts_t]

# infer the topic distribution of each test essay under the training model
topic_dist_t = [lda_t.get_document_topics(bow) for bow in corpus_t]

# Collect one row per essay and build the DataFrame in a single call.
topic_rows_t = []
for essay_id, topics in zip(essay_ids_t, topic_dist_t):
    # topics is a list of (topic_id, topic_probability) pairs; any topic
    # missing from that list keeps a probability of 0
    topic_prob = [0] * 10
    for topic, prob in topics:
        topic_prob[topic] = prob
    topic_rows_t.append([essay_id] + topic_prob)

test_df1 = pd.DataFrame(
    topic_rows_t,
    columns=['essay_id'] + ['topic{}'.format(i) for i in range(10)],
)

In [None]:
# Top 10 words for each of the 10 topics of lda_t, returned unformatted rather than as strings.
topic_test = lda_t.show_topics(num_topics=10, num_words=10, formatted=False)

In [None]:
# Print the topic/word listing obtained from lda_t.
print(topic_test)

In [None]:
# Preview the per-essay topic probabilities for the test set.
test_df1.head(10)

## Essay Characteristics (Feature Engineering 2)

In [None]:
def lexical_diversity(text):
    """Return the share of distinct items among all items of ``text``.

    ``text`` is any sized collection of hashable items (e.g. a list of words).
    An empty collection yields ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total
  
def get_metrics(text):
    """Compute simple surface statistics for one essay.

    Parameters
    ----------
    text : str
        The essay text.

    Returns
    -------
    tuple
        ``(num_words, lex_div, avg_sent_length, num_adj, num_adv, num_noun)``:
        word count, distinct-word ratio, mean words per sentence, and the
        number of tokens tagged exactly 'JJ', 'RB' and 'NN'. Text without any
        words or sentences yields 0.0 for the two ratios instead of raising
        ``ZeroDivisionError``.
    """
    blob = TextBlob(text)
    words = blob.words
    sentences = blob.sentences

    # Number of words
    num_words = len(words)

    # Lexical diversity (guarded: an essay can be empty after cleaning)
    lex_div = lexical_diversity(words) if num_words else 0.0

    # Average sentence length
    # NOTE(review): this notebook passes text from which dataPreprocessing has
    # already removed punctuation, so sentence splitting presumably sees each
    # essay as a single sentence - confirm this is intended.
    if sentences:
        avg_sent_length = sum(len(sentence.words) for sentence in sentences) / len(sentences)
    else:
        avg_sent_length = 0.0

    # Number of adjectives, adverbs and nouns, counted in one pass over the tags.
    # NOTE(review): only the exact tags are counted; variants such as
    # 'JJR', 'RBS' or 'NNS' are not - confirm this is intended.
    num_adj = num_adv = num_noun = 0
    for _, tag in blob.tags:
        if tag == 'JJ':
            num_adj += 1
        elif tag == 'RB':
            num_adv += 1
        elif tag == 'NN':
            num_noun += 1

    return num_words, lex_div, avg_sent_length, num_adj, num_adv, num_noun

In [None]:
# Set 'essay_id' as the index of the DataFrame  
train_df0.set_index('essay_id', inplace=True)

# Order matches the tuple returned by get_metrics.
metric_columns = ['num_words', 'lex_div', 'avg_sent_length', 'num_adj', 'num_adv', 'num_noun']

# Compute all metrics first, then attach them column by column. This avoids
# writing into the frame cell by cell while iterrows() is iterating over it,
# and the assignment is positional, so duplicate essay ids cannot overwrite
# one another.
train_metrics = [get_metrics(str(text)) for text in train_df0['full_text']]
for column, values in zip(metric_columns, zip(*train_metrics)):
    # float, as produced by the previous cell-wise assignment into new columns
    train_df0[column] = [float(value) for value in values]

In [None]:
# Set 'essay_id' as the index of the DataFrame
test_df0.set_index('essay_id', inplace=True)

# Order matches the tuple returned by get_metrics.
metric_columns = ['num_words', 'lex_div', 'avg_sent_length', 'num_adj', 'num_adv', 'num_noun']

# Compute all metrics first, then attach them column by column. This avoids
# writing into the frame cell by cell while iterrows() is iterating over it,
# and the assignment is positional, so duplicate essay ids cannot overwrite
# one another.
test_metrics = [get_metrics(str(text)) for text in test_df0['full_text']]
for column, values in zip(metric_columns, zip(*test_metrics)):
    # float, as produced by the previous cell-wise assignment into new columns
    test_df0[column] = [float(value) for value in values]

## Merge both dfs based on essay ids

In [None]:
# Join essay metrics (train_df0, indexed by essay_id) with topic features
# (train_df1, essay_id as a column); the inner join keeps only ids present in both.
merged_train_df = pd.merge(train_df0, train_df1, on='essay_id', how='inner')

In [None]:
# Display the merged training features.
merged_train_df

In [None]:
# Same join for the test set; the inner join keeps only ids present in both frames.
merged_test_df = pd.merge(test_df0, test_df1, on='essay_id', how='inner')

In [None]:
# Display the merged test features.
merged_test_df

## EDA

In [None]:
# plt.close()
# sns.set_style('whitegrid')
# sns.pairplot(train_df1, hue='score', height=5)
# plt.show()

## Splitting data into training and testing sets

In [None]:
# Target vector: the essay score.
y = merged_train_df['score']
# Predictors: every remaining column of the merged training frame.
X = merged_train_df.drop(columns=['score'])

In [None]:
# Hold out 10% of the labelled essays for evaluation; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)

In [None]:
# Display the training targets.
y_train

In [None]:
# Display the training predictors.
X_train

In [None]:
# Print the shape of each split: train X, train y, held-out X, held-out y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

## Instantiate TF-IDF Vectorizer

In [None]:
# Word-level vectoriser for the essay text.
# With binary=True and use_idf=False the features are term presence (0/1)
# without IDF weighting; norm='l1' then scales each essay's row to sum to 1.
text_vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    sublinear_tf=False,
    strip_accents='unicode',
    binary=True,  # non-zero term counts are set to 1
    analyzer='word',
    token_pattern=r'\w{2,}',  # a token is a run of at least two word characters
    ngram_range=(1,1),  # unigrams only
    norm='l1', 
    use_idf=False,  # no inverse-document-frequency reweighting
    smooth_idf=False,
    max_features=600000,  # upper bound on the vocabulary size
    min_df=30)  # ignore terms that occur in fewer than 30 essays

In [None]:
# Fit and transform the text data to tf-idf  
# Learn the vocabulary from the training split only, so nothing from the
# held-out split leaks into the features.
text_vectorizer.fit(X_train['full_text'])

# Training split: vectorise, wrap in a DataFrame that shares X_train's index,
# append to the other features, then drop the non-numeric columns.
X_train_tfidf = text_vectorizer.transform(X_train['full_text'])
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), index=X_train.index)
X_train_combined = pd.concat([X_train, X_train_tfidf_df], axis=1)
X_train_final = X_train_combined.drop(columns=['full_text', 'essay_id'])

# Held-out split: identical steps with the already-fitted vectoriser.
X_test_tfidf = text_vectorizer.transform(X_test['full_text'])
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), index=X_test.index)
X_test_combined = pd.concat([X_test, X_test_tfidf_df], axis=1)
X_test_final = X_test_combined.drop(columns=['full_text', 'essay_id'])

In [None]:
# Display the final held-out feature matrix.
X_test_final

In [None]:
# The vectoriser columns are integer-labelled while the other feature columns
# are strings; cast every label to str so both frames have uniform names.
for feature_frame in (X_train_final, X_test_final):
    feature_frame.columns = feature_frame.columns.astype(str)

## Train the Support Vector Machine on the features

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

# Hyper-parameters of the RBF-kernel support vector classifier.
svc_params = dict(
    C=1.75,
    kernel='rbf',
    gamma='scale',
    decision_function_shape='ovr',
    random_state=123,
    tol=1e-5,
    shrinking=True,
    verbose=True,
    break_ties=True,
)

# Scale the features with MaxAbsScaler, then classify the essay scores with the SVC.
clf = make_pipeline(MaxAbsScaler(), SVC(**svc_params))
clf.fit(X_train_final, y_train.values.ravel())

# Predictions for the held-out 10% split.
y_pred = clf.predict(X_test_final)

## Model Evaluation

In [None]:
# Confusion matrix of true vs. predicted scores on the held-out split.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Classification report for the held-out split.
print(classification_report(y_test, y_pred))

In [None]:
# Quadratic-weighted Cohen's kappa between held-out labels and predictions.
kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
print("Cohen's kappa score: ", kappa)

## Predict test features using the trained classifier (clf)

In [None]:
# Vectorise the competition test essays with the vectoriser fitted on the training split.
test_tfidf = text_vectorizer.transform(merged_test_df['full_text'])

# Wrap the vectors in a DataFrame that shares merged_test_df's index, append
# them to the other features, and drop the non-numeric columns.
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), index=merged_test_df.index)
test_combined = pd.concat([merged_test_df, test_tfidf_df], axis=1)
test_final = test_combined.drop(columns=['full_text', 'essay_id'])

# Cast every column label to str so the names are uniform.
test_final.columns = test_final.columns.astype(str)

In [None]:
# Predict a score for every test essay with the fitted pipeline.
test_predictions = clf.predict(test_final)

## Save submission into a CSV file

In [None]:
# Start from the sample submission file and overwrite its 'score' column with the predictions.
# NOTE(review): the assignment is positional, so it assumes sample_submission.csv has the
# same number of rows, in the same essay order, as merged_test_df - confirm.
submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
submission['score'] = test_predictions
# Written to the current working directory, without the index column.
submission.to_csv("submission.csv", index=False)
# NOTE(review): display is not imported in this file; presumably the IPython/Jupyter built-in.
display(submission)