## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score

In [None]:
# os.remove("/kaggle/working/submission.csv")
# os.remove("/kaggle/working/state.db")

## Data exploration

In [None]:
# List every file found under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# Load the competition's train.csv into a DataFrame.

train_df1 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')

In [None]:
# Inspect the dtype of the `score` column.
train_df1['score'].dtypes

In [None]:
# Preview the first 10 rows of the training data.
train_df1.head(10)

In [None]:
# Count how many rows carry each distinct `score` value.
train_df1['score'].value_counts()

In [None]:
# Load sample_submission.csv and show it as the cell output.
# NOTE(review): the same file is read again in the final cell before saving.
submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
submission

## Data Cleaning

In [None]:
# Contraction -> expansion lookup used by expandContractions.
#
# Key order is significant: the matching regex is built by joining the keys in
# insertion order, so existing entries keep their original positions.
#
# NOTE(review): "'d" is ambiguous ("had" / "would"); the table is not uniform
# about it (e.g. "he'd" -> "he would" but "it'd" -> "it had"). Left as found.
cList = {
    "ain't": "am not", "aren't": "are not",
    "can't": "cannot", "can't've": "cannot have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    "he'll've": "he will have", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "isn't": "is not",
    "it'd": "it had", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there had", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not",
    "we'd": "we had", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'alls": "you alls", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    # Fixed: "you'll" / "you'll've" previously expanded to "you you will ...".
    "you'd": "you had", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
    # Lower-case first-person forms. dataPreprocessing lower-cases the text
    # before expanding contractions, so the capitalised "I'..." keys above can
    # never match there. Longer forms come first so they are tried before
    # their own prefixes.
    "i'd've": "i would have", "i'd": "i would",
    "i'll've": "i will have", "i'll": "i will",
    "i'm": "i am", "i've": "i have",
}

In [None]:
# Compiled alternation of every contraction in cList.
#  * Keys are tried longest-first; otherwise "can't" wins over "can't've" and
#    leaves a dangling "'ve" behind.
#  * Keys are escaped so they are always matched literally.
#  * The look-arounds stop a key from matching inside a longer token.
#    They are zero-width, so match.group(0) is still exactly the key.
c_re = re.compile(
    r"(?<!\w)(%s)(?!\w)"
    % "|".join(re.escape(key) for key in sorted(cList, key=len, reverse=True))
)

In [None]:
def expandContractions(text, c_re=c_re):
    """Replace every contraction found in ``text`` with its cList expansion.

    Args:
        text: The string to process.
        c_re: Compiled pattern whose matches are keys of ``cList``. The
            default is the module-level ``c_re`` as it existed when this
            function was defined.

    Returns:
        ``text`` with each match of ``c_re`` swapped for ``cList[match]``.

    Raises:
        KeyError: If ``c_re`` matches text that is not a key of ``cList``.
    """
    return c_re.sub(lambda hit: cList[hit.group(0)], text)

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span deleted.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    broken over several lines is left untouched. Any ``<`` followed later on
    the same line by ``>`` is treated as a tag, including plain-text
    comparisons such as ``1 < 2 and 3 > 2``.
    """
    return re.sub(r'<.*?>', r'', x)

In [None]:
def _cleanEssayText(s):
    """Normalise a single essay string; per-element helper for dataPreprocessing."""
    s = removeHTML(s.lower())
    # Drop complete URLs first. The legacy "http\w+" rule below cannot do it:
    # "http://..." has no word character after "http", and "https://..." only
    # loses the leading "https".
    s = re.sub(r"https?://\S+", "", s)
    s = re.sub(r"@\w+", "", s)      # @mentions
    s = re.sub(r"'\d+", "", s)      # apostrophe + digits, e.g. "'90"
    s = re.sub(r"\d+", "", s)       # remaining digit runs
    s = re.sub(r"http\w+", "", s)   # legacy rule: tokens that start with "http"
    s = expandContractions(s)       # must run before apostrophes are stripped
    s = re.sub(r"[^\w\s]", "", s)   # strip all punctuation
    # Collapse whitespace last so gaps left by removed punctuation disappear.
    return re.sub(r"\s+", " ", s).strip()


def dataPreprocessing(x):
    """Clean a Series of raw essay texts for vectorisation.

    Each element is lower-cased, stripped of HTML tags, URLs, @mentions and
    digits, has its contractions expanded, loses all punctuation, and has
    whitespace runs collapsed to single spaces with the ends trimmed.

    Changes from the previous version:
      * regex literals are raw strings (no invalid-escape warnings);
      * full URLs are removed;
      * the separate "." / "," run-collapsing and newline-removal passes are
        gone, because their effect was always erased by the punctuation strip
        and the whitespace collapse;
      * whitespace is collapsed after punctuation removal, not before;
      * the Series is traversed once instead of once per step.

    Args:
        x: pandas Series whose elements are all ``str``.

    Returns:
        A new Series of cleaned strings with the same index as ``x``.

    Raises:
        AttributeError: If an element is not a string (e.g. NaN).
    """
    return x.apply(_cleanEssayText)

In [None]:
# Cleaned training essays: the `full_text` column run through dataPreprocessing.
x = dataPreprocessing(train_df1['full_text'])

In [None]:
# Show the cleaned training essays as the cell output.
x

In [None]:
# Load the competition's test.csv (the essays to be scored for submission).
test_df1 = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')

In [None]:
# Cleaned test essays, produced with the same preprocessing as the training text.
x0 = dataPreprocessing(test_df1['full_text'])

## Splitting data into training and testing sets

In [None]:
# Target: the essay score, kept as a one-column DataFrame so the later
# `y_train.values.ravel()` / `y_test.values.ravel()` calls keep working.
# Selected by name rather than the positional slice `iloc[:, 2:8]`. iloc
# slices are silently clipped to the columns that exist, so that form would
# pick up any columns appended after the third one, and flattening several
# columns breaks the fit.
# NOTE(review): assumes `score` is the only column the old slice selected
# (i.e. train.csv is id, text, score) - confirm against train_df1.columns.
y = train_df1[['score']]

In [None]:
# Hold out 10% of the training essays for evaluation; the fixed seed makes the
# split reproducible.
# NOTE(review): the split is not stratified on the score - consider
# `stratify=y` if some score values are rare.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=123)

In [None]:
# Show the training-split targets as the cell output.
y_train

In [None]:
# Show the training-split essays as the cell output.
X_train

In [None]:
# Sanity-check the split: one shape per line, in the order
# X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## Instantiate TF-IDF Vectorizer

In [None]:
# Bag-of-words vectoriser for the cleaned essays.
# With use_idf=False and binary=True this is not a true TF-IDF weighting: each
# row marks term presence and is then L1-normalised.
text_vectorizer = TfidfVectorizer(
    # --- tokenisation ---
    analyzer='word',
    token_pattern=r'\w{2,}',      # tokens are runs of at least two word characters
    ngram_range=(1, 1),           # unigrams only
    strip_accents='unicode',
    stop_words='english',
    # --- vocabulary pruning ---
    min_df=30,                    # keep terms that appear in at least 30 documents
    max_features=600000,
    # --- term weighting ---
    binary=True,                  # term presence, not term count
    sublinear_tf=False,
    use_idf=False,                # no inverse-document-frequency reweighting
    smooth_idf=False,             # has no effect while use_idf is False
    norm='l1',                    # each row's absolute values sum to 1
)

In [None]:
# Learn the vocabulary from the training split only and vectorise that split.
X_train_features = text_vectorizer.fit_transform(X_train)

In [None]:
# Show the training feature matrix summary as the cell output.
X_train_features

In [None]:
# Vectorise the held-out split with the vocabulary learned from the training split.
# Despite the name, `test_features` is the 10% hold-out from train_test_split,
# not the competition test set (that one is vectorised into `st_features`).
test_features = text_vectorizer.transform(X_test)

## Train the Support Vector Machine on the extracted features

In [None]:
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import MaxAbsScaler
# from sklearn.svm import SVC
    
# clf = make_pipeline(MaxAbsScaler(), SVC(C=1.75, kernel='rbf', gamma='scale', decision_function_shape='ovr', random_state=123, tol=1e-5, shrinking=True, verbose=True, break_ties=True))
# clf.fit(X_train_features, y_train.values.ravel())
# y_pred = clf.predict(test_features)  

In [None]:
# print("Number of dimensions: ", test_features.ndim)  
# print("Shape of the array: ", test_features.shape)  

## Bagging Classifier + SVM

In [None]:
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Bagged ensemble of 10 RBF-kernel SVMs; seeds are fixed for reproducibility.
model = BaggingClassifier(
    estimator=SVC(
        C=1.75,
        kernel='rbf',
        gamma='scale',
        decision_function_shape='ovr',
        random_state=123,
        tol=1e-5,
        shrinking=True,
        verbose=True,
        break_ties=True,
    ),
    n_estimators=10,
    random_state=123,
    verbose=3,
)

# Fit on the training split; the one-column target frame is flattened to 1-D.
model.fit(X_train_features, y_train.values.ravel())

# Predict scores for the held-out split.
y_pred_bag = model.predict(test_features)

## Model Evaluation

In [None]:
# Confusion matrix on the held-out split (first argument is the true labels,
# second the predictions).
print(confusion_matrix(y_test.values.ravel(), y_pred_bag.ravel()))

In [None]:
# Text classification report for the held-out split.
print(classification_report(y_test.values.ravel(), y_pred_bag.ravel()))

In [None]:
# Quadratic-weighted Cohen's kappa between held-out labels and predictions.
kappa = cohen_kappa_score(
    y_test.values.ravel(),
    y_pred_bag.ravel(),
    weights='quadratic',
)
print("Cohen's kappa score: ", kappa)

## Predict the competition test set using the trained classifier (model)

In [None]:
# Vectorise the cleaned competition test essays with the already-fitted vectoriser.
st_features = text_vectorizer.transform(x0)

# Predict a score for every test essay with the trained bagging model.
test_predictions = model.predict(st_features)

## Save submission into a CSV file

In [None]:
# Re-load the sample submission as a template, overwrite its `score` column
# with the predictions, and write submission.csv to the working directory.
# NOTE(review): assumes sample_submission.csv lists essays in the same order
# as test.csv - confirm.
submission = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")
submission['score'] = test_predictions
submission.to_csv("submission.csv", index=False)
# `display` is not imported in this file; presumably the IPython/Jupyter built-in.
display(submission)