## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import re
import string
from textblob import TextBlob
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score

In [None]:
# os.remove("/kaggle/working/submission.csv")
# os.remove("/kaggle/working/state.db")

## Data exploration

In [None]:
# List every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


# Load the training essays into a DataFrame.
train_df1 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')

In [None]:
# Show the dtype of the 'score' column (used later as the prediction target).
train_df1['score'].dtypes

In [None]:
# Preview the first 10 rows of the training data.
train_df1.head(10)

In [None]:
# Count how many essays received each score value.
train_df1['score'].value_counts()

## Data Cleaning

In [None]:
# Lookup table mapping an English contraction to its expanded form.
# The keys are joined into one regular expression elsewhere in this notebook,
# and every match in an essay is replaced by the corresponding value.
# NOTE(review): the generic "'s" entry rewrites every "'s" as " is", which
# also affects possessives (e.g. "John's book") -- confirm this is intended.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

In [None]:
def _compile_contractions_re(mapping):
    """Compile a single alternation pattern that matches any key of *mapping*.

    Regex alternation picks the first alternative that matches, so the keys
    are ordered longest-first: otherwise "can't" would win over "can't've"
    and leave a dangling "'ve" behind. Keys are escaped so they are always
    matched literally.
    """
    ordered_keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))


# Regular expression for finding contractions (built from the default table).
_DEFAULT_CONTRACTIONS = contractions_dict
contractions_re = _compile_contractions_re(_DEFAULT_CONTRACTIONS)


# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in *text* with its expanded form.

    Args:
        text: The string to process.
        contractions_dict: Mapping of contraction -> expansion. Defaults to
            the module-level table; a custom mapping gets its own pattern so
            that matches and lookups always agree.

    Returns:
        The text with all matching contractions expanded. Matching is
        case-sensitive, exactly as the keys are written.
    """
    if not contractions_dict:
        # Nothing to expand; an empty alternation would match everywhere.
        return text

    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern = contractions_re
    else:
        pattern = _compile_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

In [None]:
def removeHTML(x):
    """Return *x* with every ``<...>`` span (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', x)

In [None]:
# Patterns used by the cleaning pipeline, compiled once and written as raw
# strings so that backslash escapes reach the regex engine unchanged.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_text(text):
    """Normalize a single essay string (see dataPreprocessing for the steps)."""
    text = expand_contractions(text)
    # Markup, URLs and @mentions are recognised by their punctuation
    # ('<', '>', '://', '@'), so they must be removed BEFORE punctuation is
    # stripped; doing it afterwards can never match.
    text = removeHTML(text)
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _DIGIT_WORD_RE.sub('', text)  # digits and words containing digits
    text = text.lower()
    text = _PUNCTUATION_RE.sub('', text)  # ASCII punctuation, including '_'
    text = _NON_WORD_RE.sub('', text)  # any remaining non-word symbols
    text = _WHITESPACE_RE.sub(' ', text)  # collapse runs of spaces/newlines
    return text.strip()


def dataPreprocessing(x):
    """Clean a pandas Series of essay texts.

    Each string is processed in this order: expand contractions, drop HTML
    tags, URLs and @mentions, drop tokens containing digits, lower-case,
    strip punctuation and other non-word symbols, collapse whitespace and
    trim both ends.

    Args:
        x: Series of strings (the notebook passes the ``full_text`` column).

    Returns:
        A new Series with the cleaned strings. The result contains no
        punctuation at all, so sentence boundaries are not preserved.
    """
    return x.apply(_clean_text)

In [None]:
# Clean the raw training essays.
x = dataPreprocessing(train_df1['full_text'])

In [None]:
# Overwrite the raw essay text with its cleaned version.
train_df1['full_text'] = x

## Identify Essay Characteristics

In [None]:
# nltk.download('punkt')  
# nltk.download('averaged_perceptron_tagger')  
  
def lexical_diversity(text):
    """Return the ratio of distinct items to total items in *text*.

    Args:
        text: A sized collection of hashable items (the notebook passes a
            list of words).

    Returns:
        ``len(set(text)) / len(text)``, or ``0.0`` when *text* is empty
        (instead of raising ZeroDivisionError).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return len(set(text)) / total
  
def get_metrics(text):
    """Compute simple surface statistics for one essay.

    Args:
        text: The essay as a string.

    Returns:
        A tuple ``(num_words, lex_div, avg_sent_length, num_adj, num_adv,
        num_noun)`` where the last three count part-of-speech tags that are
        exactly ``'JJ'``, ``'RB'`` and ``'NN'`` respectively (any other tag
        is not counted). For text without words or sentences the ratio
        features are ``0.0`` rather than raising ZeroDivisionError.
    """
    blob = TextBlob(text)
    words = blob.words
    sentences = blob.sentences

    # Number of words
    num_words = len(words)

    # Lexical diversity (guarded here so an empty essay cannot divide by zero)
    lex_div = lexical_diversity(words) if num_words else 0.0

    # Average sentence length, in words per sentence
    if sentences:
        avg_sent_length = sum(len(sentence.words) for sentence in sentences) / len(sentences)
    else:
        avg_sent_length = 0.0

    # Number of adjectives, adverbs and nouns (exact tag matches only)
    pos_tags = [tag for _word, tag in blob.tags]
    num_adj = pos_tags.count('JJ')
    num_adv = pos_tags.count('RB')
    num_noun = pos_tags.count('NN')

    return num_words, lex_div, avg_sent_length, num_adj, num_adv, num_noun
    
# num_words, lex_div, avg_sent_length, num_adj, num_adv, num_noun = get_metrics(str(train_df1['full_text']))
  
# print(f'Number of Words: {num_words}')  
# print(f'Lexical Diversity: {lex_div}')  
# print(f'Average Sentence Length: {avg_sent_length}')  
# print(f'Number of Adjectives: {num_adj}')  
# print(f'Number of Adverbs: {num_adv}')  
# print(f'Number of Nouns: {num_noun}')  

In [None]:
# Set 'essay_id' as the index of the DataFrame
train_df1.set_index('essay_id', inplace=True)

# Compute the metrics once per essay and attach each feature as a whole
# column. Writing single cells with .loc inside iterrows() is far slower.
metric_names = ['num_words', 'lex_div', 'avg_sent_length', 'num_adj', 'num_adv', 'num_noun']
metric_rows = [get_metrics(str(text)) for text in train_df1['full_text']]
metric_frame = pd.DataFrame(metric_rows, columns=metric_names, index=train_df1.index).astype(float)

for metric_name in metric_names:
    # Positional assignment: metric_rows is in the same row order as train_df1.
    train_df1[metric_name] = metric_frame[metric_name].to_numpy()

In [None]:
# train_df1.head(20)
# Inspect the last 20 rows, including the newly added metric columns.
train_df1.tail(20)

## Preparing Text Data for Exploratory Data Analysis (EDA)

In [None]:
# nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

In [None]:
# def dataPrep(x):
#     # remove stopwords and apply lemmatization
#     x = x.apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))
    
# dataPrep(x)
# dataPrep(x0)

In [None]:
# train_df1['full_text'] = x
# display(train_df1)

## Create a Document Term Matrix.

In [None]:
# tv = TfidfVectorizer(
#     stop_words='english',
#     sublinear_tf=False,
#     strip_accents='unicode',
#     binary=True,
#     analyzer='word',
#     token_pattern=r'\w{3,}',  
#     ngram_range=(3,6),
#     norm='l1', 
#     use_idf=False, 
#     smooth_idf=False,
#     max_features=9000000,
#     min_df=30)

In [None]:
# data = tv.fit_transform(train_df1['full_text'])  
# feature_names = tv.get_feature_names_out()  
  
# train_df1_tv = pd.DataFrame(data.toarray(), columns=feature_names)  
# train_df1_tv.index= train_df1.index  

In [None]:
# # Concatenate the original DataFrame with the TF-IDF DataFrame  
# train_df1_final = pd.concat([train_df1, train_df1_tv], axis=1) 

## Splitting data into training and testing sets

In [None]:
# Features are every column except the target and the raw text;
# the target is the essay score.
X = train_df1.drop(columns=['score', 'full_text'])
y = train_df1['score']

In [None]:
# Display the target Series.
y

In [None]:
# Hold out 20% of the essays for evaluation; the fixed random_state makes the split reproducible.
# NOTE(review): no `stratify` argument is passed, so score proportions may differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Display the training targets.
y_train

In [None]:
# Display the training features.
X_train

In [None]:
# Report the shape of each part of the split, one per line.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

## Instantiate TF-IDF Vectorizer

In [None]:
# X_train_features = text_vectorizer.fit_transform(X_train)

In [None]:
# X_train_features

In [None]:
# test_features = text_vectorizer.transform(X_test)

## Train a Support Vector Machine on the extracted features

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

# RBF-kernel support vector classifier, configured one option per line.
svm_classifier = SVC(
    C=1.75,
    kernel='rbf',
    gamma='scale',
    decision_function_shape='ovr',
    random_state=123,
    tol=1e-5,
    shrinking=True,
    verbose=True,
    break_ties=True,
)

# Scale the features, then classify; fit on the training split and
# predict scores for the held-out split.
clf = make_pipeline(MaxAbsScaler(), svm_classifier)
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted scores on the held-out split.
print(confusion_matrix(y_test.values.ravel(), y_pred.ravel()))

In [None]:
# Per-class classification report on the held-out split.
print(classification_report(y_test.values.ravel(), y_pred.ravel()))

In [None]:
# Quadratic-weighted Cohen's kappa between true and predicted scores.
true_scores = y_test.values.ravel()
predicted_scores = y_pred.ravel()
kappa = cohen_kappa_score(true_scores, predicted_scores, weights='quadratic')
print("Cohen's kappa score: ", kappa)

## Bagging Classifier + SVM

In [None]:
# from sklearn import metrics  
# from sklearn.ensemble import BaggingClassifier  

# # fit a Bagging model to the data  
# model = BaggingClassifier(estimator=clf, n_estimators=10, random_state=123, verbose=3)
# model.fit(X_train_features, y_train.values.ravel())
# y_pred_bag = model.predict(test_features)

## Model Evaluation

In [None]:
# print(confusion_matrix(y_test.values.ravel(), y_pred_bag.ravel()))

In [None]:
# print(classification_report(y_test.values.ravel(), y_pred_bag.ravel()))

In [None]:
# kappa = cohen_kappa_score(y_test.values.ravel(), y_pred_bag.ravel(), weights='quadratic')  
# print('Cohen\'s kappa score: ', kappa)  

## Predict scores for the test set using the trained classifier (clf)

In [None]:
# Load the competition test essays.
test_df1 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

In [None]:
# Display the test DataFrame.
test_df1

In [None]:
# Apply the same cleaning to the test essays as was applied to the training essays.
x0 = dataPreprocessing(test_df1['full_text'])

In [None]:
# Overwrite the raw test text with its cleaned version.
test_df1['full_text'] = x0

In [None]:
# Set 'essay_id' as the index of the DataFrame
test_df1.set_index('essay_id', inplace=True)

# Compute the metrics once per essay and attach each feature as a whole
# column, in the same column order as for the training data. Writing single
# cells with .loc inside iterrows() is far slower.
test_metric_names = ['num_words', 'lex_div', 'avg_sent_length', 'num_adj', 'num_adv', 'num_noun']
test_metric_rows = [get_metrics(str(text)) for text in test_df1['full_text']]
test_metric_frame = pd.DataFrame(
    test_metric_rows, columns=test_metric_names, index=test_df1.index
).astype(float)

for metric_name in test_metric_names:
    # Positional assignment: test_metric_rows is in the same row order as test_df1.
    test_df1[metric_name] = test_metric_frame[metric_name].to_numpy()

In [None]:
# st_features = text_vectorizer.transform(x0)

# Drop the raw text so only the remaining feature columns are passed to the
# trained pipeline, then predict a score for every test essay.
Y_test_final = test_df1.drop(columns='full_text')
test_predictions = clf.predict(Y_test_final)

## Save submission into a CSV file

In [None]:
# Load the sample submission file and display it.
submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
submission

In [None]:
# Pair each prediction with the essay_id it was computed for. test_df1 is
# indexed by 'essay_id' and test_predictions follows its row order, so this
# cannot mis-assign scores the way a positional write into the sample
# submission could if that file listed the essays in a different order.
# NOTE(review): assumes the expected submission columns are exactly
# 'essay_id' and 'score' -- confirm against sample_submission.csv.
submission = pd.DataFrame({
    'essay_id': test_df1.index,
    'score': test_predictions,
})
submission.to_csv("submission.csv", index=False)
display(submission)