# Model

In [2]:
from sklearn.metrics import ndcg_score
import numpy as np
from scipy import stats

def normalize(p, q):
    """Rescale two non-negative vectors so that each one sums to 1.

    Both arguments are converted with ``np.asarray``.  An ``AssertionError``
    is raised when either contains a negative entry (for ``p`` the offending
    array is attached as the assertion message).

    Returns:
        The pair ``(p / p.sum(), q / q.sum())``.
    """
    p = np.asarray(p)
    q = np.asarray(q)
    assert np.all(p >= 0), p
    assert np.all(q >= 0)
    return p / p.sum(), q / q.sum()

def JSD(p, q, base=2):
    """Jensen-Shannon divergence between two non-negative vectors.

    The inputs are first rescaled to sum to 1 via ``normalize``; the result is
    the average of the two relative entropies (``stats.entropy``) of ``p`` and
    ``q`` against their midpoint distribution, using logarithm base ``base``.
    """
    p, q = normalize(p, q)
    midpoint = (p + q) * 0.5
    divergence_p = stats.entropy(p, midpoint, base=base)
    divergence_q = stats.entropy(q, midpoint, base=base)
    return divergence_p / 2. + divergence_q / 2.

def rnorm_sum_squares(x, y):
    """Relative normalised sum-of-squares distance between two arrays.

    Computes ``sqrt(sum((x - y)**2) / sum(x**2 + y**2))``.

    Args:
        x, y: array-likes of identical shape.

    Returns:
        The distance as a float; ``0.0`` when both inputs are entirely zero
        (they are identical, and the ratio would otherwise be 0/0 = NaN).

    Raises:
        AssertionError: if the two shapes differ.
    """
    # Work in floating point so integer inputs (unsigned ones in particular)
    # cannot wrap around in the subtraction or when squared.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    assert x.shape == y.shape
    numerator = np.sum(np.square(x - y))
    denominator = np.sum(np.square(x) + np.square(y))
    if denominator == 0:
        return 0.0
    return np.sqrt(numerator / denominator)

def print_metrics(y_true, y_pred):
    """Score predictions against reference labels with three metrics.

    Despite the name nothing is printed; the scores are returned as the list
    ``[NDCG, 1 - JSD, 1 - rnorm_sum_squares]`` (in that order), where NDCG
    treats the two vectors as a single query.
    """
    ndcg = ndcg_score([y_true], [y_pred])
    reversed_jsd = 1 - JSD(y_true, y_pred, base=2)
    reversed_rnss = 1 - rnorm_sum_squares(y_true, y_pred)
    return [ndcg, reversed_jsd, reversed_rnss]

In [3]:
def discretize(y_pred):
    """Snap continuous scores onto the five levels 0.0, 1.0, 2.0, 3.0, 4.0.

    Buckets: ``< 0.5 -> 0``, ``[0.5, 1.5) -> 1``, ``[1.5, 2.5) -> 2``,
    ``[2.5, 3.5) -> 3`` and everything from 3.5 upwards ``-> 4``.

    The container passed in is overwritten in place and also returned, so
    existing callers that rely on either behaviour keep working.  NumPy arrays
    of any shape are handled in one vectorised step; other mutable sequences
    (e.g. lists) are updated through slice assignment.
    """
    boundaries = [0.5, 1.5, 2.5, 3.5]
    # digitize returns, for each value, how many boundaries are <= that value,
    # which is exactly the target level.
    levels = np.digitize(np.asarray(y_pred, dtype=float), boundaries).astype(float)
    if isinstance(y_pred, np.ndarray):
        y_pred[...] = levels
    else:
        y_pred[:] = levels.tolist()
    return y_pred

XGBoost

In [9]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class GloVeFeatXGBoost():
    """XGBoost regressor on mean-pooled GloVe text vectors plus extra features.

    Every text sample is embedded as the average of its tokens' GloVe vectors,
    the embeddings are concatenated column-wise with the caller-supplied
    feature matrix, and an ``XGBRegressor`` is fitted on the result.
    Predictions are passed through ``discretize`` before being returned.
    """

    # Parsed embedding tables keyed by file path.  Shared by every instance so
    # that building several models does not re-parse the same GloVe file.
    _glove_cache = {}

    def __init__(self, embedding_size=300, n_estimators=100, max_depth=3, learning_rate=0.1,
                 glove_file=r'G:/L2\glove\glove.6B.300d.txt'):
        """Load (or reuse) the GloVe table and create the regressor.

        Args:
            embedding_size: length of the zero vector used for unknown tokens
                and empty texts; should equal the GloVe vector length.
            n_estimators, max_depth, learning_rate: forwarded to ``XGBRegressor``.
            glove_file: path of the GloVe text file (one token per line
                followed by its vector components).
        """
        self.glove_file = glove_file
        cache = type(self)._glove_cache
        if glove_file not in cache:
            cache[glove_file] = self.load_glove_model(glove_file)
        self.glove_model = cache[glove_file]
        self.embedding_size = embedding_size
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.clf = XGBRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate
        )

    def load_glove_model(self, glove_file):
        """Parse ``glove_file`` into a ``{token: float vector}`` dictionary.

        Blank lines are skipped instead of raising ``IndexError``.
        """
        model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                model[parts[0]] = np.array(parts[1:], dtype=float)
        return model

    @staticmethod
    def _tokens(sentence):
        """Split one text sample into whitespace-delimited tokens.

        Accepts a plain string, or any iterable of string pieces (such as the
        one-element row obtained from a single-column slice), which are joined
        with spaces before splitting.
        """
        if isinstance(sentence, str):
            return sentence.split()
        return ' '.join(str(piece) for piece in sentence).split()

    def _get_sentence_embedding(self, sentence):
        """Return the mean GloVe vector of ``sentence``.

        The sample is tokenised on whitespace first; previously a row holding
        the whole text as a single element was looked up as one "word" and
        therefore always embedded to zeros.  Unknown tokens count as zero
        vectors (they still take part in the average); a sample without any
        token yields an all-zero vector.
        """
        # NOTE(review): tokens are looked up verbatim (no lower-casing or
        # punctuation stripping) - confirm this matches the GloVe vocabulary.
        unknown = np.zeros(self.embedding_size)
        tokens = self._tokens(sentence)
        if not tokens:
            return unknown
        return np.mean([self.glove_model.get(token, unknown) for token in tokens], axis=0)

    def _design_matrix(self, texts, feats):
        """Stack the text embeddings and the extra feature columns side by side."""
        text_vecs = np.array([self._get_sentence_embedding(sentence) for sentence in texts])
        return np.hstack((text_vecs, feats))

    def train(self, X_train_text, X_train_feat, y_train):
        """Fit the regressor on embedded texts concatenated with ``X_train_feat``."""
        self.clf.fit(self._design_matrix(X_train_text, X_train_feat), y_train)

    def predict(self, X_test_text, X_test_feat):
        """Predict and discretize a level for every test sample."""
        return discretize(self.clf.predict(self._design_matrix(X_test_text, X_test_feat)))

    def evaluate(self, X_test_text, X_test_feat, y_test):
        """Return ``(accuracy, macro precision, macro recall, macro F1)`` on the test set."""
        y_pred = self.predict(X_test_text, X_test_feat)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1

In [4]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class GloVeXGBoost():
    """XGBoost regressor on mean-pooled GloVe text vectors only.

    Every text sample is embedded as the average of the GloVe vectors of its
    in-vocabulary tokens and an ``XGBRegressor`` is fitted on those vectors.
    Predictions are passed through ``discretize`` before being returned.
    """

    # Parsed embedding tables keyed by file path.  Shared by every instance so
    # that building several models does not re-parse the same GloVe file.
    _glove_cache = {}

    def __init__(self, embedding_size=300, n_estimators=100, max_depth=3, learning_rate=0.1,
                 glove_file=r'G:/L2\glove\glove.6B.300d.txt'):
        """Load (or reuse) the GloVe table and create the regressor.

        Args:
            embedding_size: length of the zero vector returned for texts with
                no in-vocabulary token; should equal the GloVe vector length.
            n_estimators, max_depth, learning_rate: forwarded to ``XGBRegressor``.
            glove_file: path of the GloVe text file (one token per line
                followed by its vector components).
        """
        self.glove_file = glove_file
        cache = type(self)._glove_cache
        if glove_file not in cache:
            cache[glove_file] = self.load_glove_model(glove_file)
        self.glove_model = cache[glove_file]
        self.embedding_size = embedding_size
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.clf = XGBRegressor(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate
        )

    def load_glove_model(self, glove_file):
        """Parse ``glove_file`` into a ``{token: float vector}`` dictionary.

        Blank lines are skipped instead of raising ``IndexError``.
        """
        model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                model[parts[0]] = np.array(parts[1:], dtype=float)
        return model

    def _get_sentence_embedding(self, sentence):
        """Return the mean GloVe vector of the in-vocabulary tokens of ``sentence``.

        ``sentence`` may be a plain string or an iterable of string pieces
        (joined with spaces before whitespace tokenisation).  When no token is
        found in the vocabulary an all-zero vector of ``embedding_size`` is
        returned; averaging an empty list would otherwise give a scalar NaN
        and break the stacked feature matrix.
        """
        # NOTE(review): tokens are looked up verbatim (no lower-casing or
        # punctuation stripping) - confirm this matches the GloVe vocabulary.
        if isinstance(sentence, str):
            text = sentence
        else:
            text = ' '.join(str(piece) for piece in sentence)
        vecs = [self.glove_model[word] for word in text.split() if word in self.glove_model]
        if not vecs:
            return np.zeros(self.embedding_size)
        return np.mean(vecs, axis=0)

    def _embed_all(self, texts):
        """Embed every sample of ``texts`` into one 2-D array."""
        return np.array([self._get_sentence_embedding(sentence) for sentence in texts])

    def train(self, X_train, y_train):
        """Fit the regressor on the embedded training texts."""
        self.clf.fit(self._embed_all(X_train), y_train)

    def predict(self, X_test):
        """Predict and discretize a level for every test sample."""
        return discretize(self.clf.predict(self._embed_all(X_test)))

    def evaluate(self, X_test, y_test):
        """Return ``(accuracy, macro precision, macro recall, macro F1)`` on the test set."""
        y_pred = self.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1

SVM

In [15]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class GloVeFeatSVM():
    """SVM classifier on mean-pooled GloVe text vectors plus standardised features.

    Every text sample is embedded as the average of the GloVe vectors of its
    in-vocabulary tokens; the extra feature columns are standardised with a
    ``StandardScaler`` fitted on the training data and appended to the
    embeddings before an ``SVC`` is fitted.
    """

    # Parsed embedding tables keyed by file path.  Shared by every instance so
    # that building several models does not re-parse the same GloVe file.
    _glove_cache = {}

    def __init__(self, kernel='linear', C=1, glove_file=r'G:/L2\glove\glove.6B.300d.txt'):
        """Load (or reuse) the GloVe table and create the classifier.

        Args:
            kernel, C: forwarded to ``SVC``.
            glove_file: path of the GloVe text file (one token per line
                followed by its vector components).
        """
        # Load the GloVe word vector model
        self.glove_file = glove_file
        cache = type(self)._glove_cache
        if glove_file not in cache:
            cache[glove_file] = self.load_glove_model(glove_file)
        self.glove_model = cache[glove_file]
        self.kernel = kernel
        self.C = C
        self.clf = SVC(kernel=self.kernel, C=self.C)

    def load_glove_model(self, glove_file):
        """Parse ``glove_file`` into a ``{token: float32 vector}`` dictionary.

        The file is streamed line by line (not read into memory at once) and
        blank lines are skipped.
        """
        glove_model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    continue
                glove_model[values[0]] = np.asarray(values[1:], dtype='float32')
        return glove_model

    def _zero_vector(self):
        """Return a zero vector shaped like the loaded GloVe vectors."""
        sample = next(iter(self.glove_model.values()), None)
        if sample is None:
            raise ValueError('GloVe model is empty; cannot build a fallback embedding')
        return np.zeros_like(sample)

    def _get_sentence_embedding(self, sentence):
        """Return the mean GloVe vector of the in-vocabulary tokens of ``sentence``.

        ``sentence`` may be a plain string or an iterable of string pieces
        (joined with spaces before whitespace tokenisation).  When no token is
        found in the vocabulary an all-zero vector is returned; averaging an
        empty list would otherwise give a scalar NaN and break the stacked
        feature matrix.
        """
        # NOTE(review): tokens are looked up verbatim (no lower-casing or
        # punctuation stripping) - confirm this matches the GloVe vocabulary.
        if isinstance(sentence, str):
            text = sentence
        else:
            text = ' '.join(str(piece) for piece in sentence)
        vecs = [self.glove_model[word] for word in text.split() if word in self.glove_model]
        if not vecs:
            return self._zero_vector()
        return np.mean(np.array(vecs), axis=0)

    def _embed_all(self, texts):
        """Embed every sample of ``texts`` into one 2-D array."""
        return np.array([self._get_sentence_embedding(text) for text in texts])

    def train(self, X_train, X_train_feat, y_train):
        """Fit the feature scaler and the SVM on the training data.

        The fitted scaler is kept on ``self.scaler`` so that ``predict``
        applies the same transformation to unseen data.
        """
        train_vecs = self._embed_all(X_train)
        self.scaler = StandardScaler()
        X_feat_train = self.scaler.fit_transform(X_train_feat)
        self.X_train = np.hstack((train_vecs, X_feat_train))
        self.y_train = y_train
        self.clf.fit(self.X_train, self.y_train)

    def predict(self, X_test, X_test_feat):
        """Predict and discretize a level for every test sample.

        Test features are transformed with the scaler fitted in ``train``.
        Re-fitting a scaler on the test set (as was done before) puts test
        features on a different scale than the one the SVM was trained on.
        """
        test_vecs = self._embed_all(X_test)
        X_feat_test = self.scaler.transform(X_test_feat)
        X_test = np.hstack((test_vecs, X_feat_test))
        return discretize(self.clf.predict(X_test))

    def evaluate(self, X_test_text, X_test_feat, y_test):
        """Return ``(accuracy, macro precision, macro recall, macro F1)`` on the test set."""
        y_pred = self.predict(X_test_text, X_test_feat)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

class GloVeSVM():
    """SVM classifier on mean-pooled GloVe text vectors only.

    Every text sample is embedded as the average of the GloVe vectors of its
    in-vocabulary tokens and an ``SVC`` is fitted on those vectors.
    """

    # Parsed embedding tables keyed by file path.  Shared by every instance so
    # that building several models does not re-parse the same GloVe file.
    _glove_cache = {}

    def __init__(self, kernel='linear', C=1, glove_file='G:\\../L2/glove/glove.6B.300d.txt'):
        """Load (or reuse) the GloVe table and create the classifier.

        Args:
            kernel, C: forwarded to ``SVC``.
            glove_file: path of the GloVe text file (one token per line
                followed by its vector components).
        """
        # Load the GloVe word vector model
        self.glove_file = glove_file
        cache = type(self)._glove_cache
        if glove_file not in cache:
            cache[glove_file] = self.load_glove_model(glove_file)
        self.glove_model = cache[glove_file]
        self.kernel = kernel
        self.C = C
        self.clf = SVC(kernel=self.kernel, C=self.C)

    def load_glove_model(self, glove_file):
        """Parse ``glove_file`` into a ``{token: float32 vector}`` dictionary.

        The file is streamed line by line (not read into memory at once) and
        blank lines are skipped.
        """
        glove_model = {}
        with open(glove_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    continue
                glove_model[values[0]] = np.asarray(values[1:], dtype='float32')
        return glove_model

    def _zero_vector(self):
        """Return a zero vector shaped like the loaded GloVe vectors."""
        sample = next(iter(self.glove_model.values()), None)
        if sample is None:
            raise ValueError('GloVe model is empty; cannot build a fallback embedding')
        return np.zeros_like(sample)

    def _get_sentence_embedding(self, sentence):
        """Return the mean GloVe vector of the in-vocabulary tokens of ``sentence``.

        ``sentence`` may be a plain string or an iterable of string pieces
        (joined with spaces before whitespace tokenisation).  When no token is
        found in the vocabulary an all-zero vector is returned; averaging an
        empty list would otherwise give a scalar NaN and break the stacked
        feature matrix.
        """
        # NOTE(review): tokens are looked up verbatim (no lower-casing or
        # punctuation stripping) - confirm this matches the GloVe vocabulary.
        if isinstance(sentence, str):
            text = sentence
        else:
            text = ' '.join(str(piece) for piece in sentence)
        vecs = [self.glove_model[word] for word in text.split() if word in self.glove_model]
        if not vecs:
            return self._zero_vector()
        return np.mean(np.array(vecs), axis=0)

    def _embed_all(self, texts):
        """Embed every sample of ``texts`` into one 2-D array."""
        return np.array([self._get_sentence_embedding(text) for text in texts])

    def train(self, X_train, y_train):
        """Fit the SVM on the embedded training texts."""
        X_train = self._embed_all(X_train)
        y_train = np.array(y_train)
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Predict and discretize a level for every test sample."""
        return discretize(self.clf.predict(self._embed_all(X_test)))

    def evaluate(self, X_test_text, y_test):
        """Return ``(accuracy, macro precision, macro recall, macro F1)`` on the test set."""
        y_pred = self.predict(X_test_text)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1


# Datasets

In [5]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
def get_train_test(feature, label, test_size=0.2, random_state=42):
    """Stratified train/test split returned as NumPy arrays.

    Args:
        feature: samples to split (e.g. a DataFrame slice).
        label: targets; also used as the stratification key.
        test_size: fraction (or count) held out for testing; default 0.2.
        random_state: seed passed to ``train_test_split``; default 42.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the two ``X`` parts are
        ``np.array`` conversions of the split inputs and the two ``y`` parts
        are additionally flattened to 1-D with ``ravel``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label,
        test_size=test_size, random_state=random_state, stratify=label,
    )
    return (
        np.array(X_train),
        np.array(X_test),
        np.array(y_train).ravel(),
        np.array(y_test).ravel(),
    )

In [1]:
# Echo the kernel's working directory as the cell output; the relative
# "../L2/feature/..." CSV paths used when loading the corpora resolve against it.
import os
os.getcwd()


'g:\\广大资料\\跨指标难度评估-丁汉师兄\\数据文件\\code_Cross-corpus readability assessment compatibility for English texts\\model'

In [13]:
# label-feature
import pandas as pd


def load_feature_split(corpus, feature_dir="../L2/feature"):
    """Read one corpus CSV and split its feature columns against the label.

    The file ``<feature_dir>/[<corpus>]with_features.csv`` is read; column 1
    is taken as the label and columns 2..22 as the feature matrix, then both
    are handed to ``get_train_test``.

    Returns:
        ``(frame, X_train, X_test, y_train, y_test)`` where ``frame`` is the
        full DataFrame that was read.
    """
    frame = pd.read_csv("{}/[{}]with_features.csv".format(feature_dir, corpus))
    labels = frame.iloc[:, 1:2]
    features = frame.iloc[:, 2:23]
    return (frame,) + tuple(get_train_test(features, labels))


############ CEFR #########################################
(CEFR, CEFR_feature_train, CEFR_feature_test,
 CEFR_featurey_train, CEFR_featurey_test) = load_feature_split("CEFR")
# ############ CLEC #######################################################################################
(CLEC, CLEC_feature_train, CLEC_feature_test,
 CLEC_featurey_train, CLEC_featurey_test) = load_feature_split("CLEC")
# ############ CLOTH #######################################################################################
(CLOTH, CLOTH_feature_train, CLOTH_feature_test,
 CLOTH_featurey_train, CLOTH_featurey_test) = load_feature_split("CLOTH")
# ############ NES #######################################################################################
(NES, NES_feature_train, NES_feature_test,
 NES_featurey_train, NES_featurey_test) = load_feature_split("NES")
# ############ OSP #######################################################################################
(OSP, OSP_feature_train, OSP_feature_test,
 OSP_featurey_train, OSP_featurey_test) = load_feature_split("OSP")
# ############ RACE #######################################################################################
(RACE, RACE_feature_train, RACE_feature_test,
 RACE_featurey_train, RACE_featurey_test) = load_feature_split("RACE")

# Scratch names the original cell left behind (they held the last corpus
# loaded); kept so any other cell that still reads them sees the same values.
df = RACE
label = df.iloc[:, 1:2]
feature = df.iloc[:, 2:23]

In [6]:
# label-text
def load_text_split(corpus, feature_dir="../L2/feature"):
    """Read one corpus CSV and split its text column against the label.

    The file ``<feature_dir>/[<corpus>]with_features.csv`` is read; column 0
    is taken as the text and column 1 as the label, then both are handed to
    ``get_train_test``.  Because the text is sliced as a one-column frame,
    each returned text row is a one-element array.

    Returns:
        ``(frame, X_train, X_test, y_train, y_test)`` where ``frame`` is the
        full DataFrame that was read.
    """
    frame = pd.read_csv("{}/[{}]with_features.csv".format(feature_dir, corpus))
    labels = frame.iloc[:, 1:2]
    texts = frame.iloc[:, 0:1]
    return (frame,) + tuple(get_train_test(texts, labels))


############ CEFR ##################################################################
CEFR, CEFR_X_train, CEFR_X_test, CEFR_y_train, CEFR_y_test = load_text_split("CEFR")
# ############ CLEC #################################################################
CLEC, CLEC_X_train, CLEC_X_test, CLEC_y_train, CLEC_y_test = load_text_split("CLEC")
# ############ CLOTH ################################################################
CLOTH, CLOTH_X_train, CLOTH_X_test, CLOTH_y_train, CLOTH_y_test = load_text_split("CLOTH")
# ############ NES ##################################################################
NES, NES_X_train, NES_X_test, NES_y_train, NES_y_test = load_text_split("NES")
# ############ OSP ##################################################################
OSP, OSP_X_train, OSP_X_test, OSP_y_train, OSP_y_test = load_text_split("OSP")
# ############ RACE #################################################################
RACE, RACE_X_train, RACE_X_test, RACE_y_train, RACE_y_test = load_text_split("RACE")

# Scratch names the original cell left behind (they held the last corpus
# loaded); kept so any other cell that still reads them sees the same values.
df = RACE
label = df.iloc[:, 1:2]
text = df.iloc[:, 0:1]


# Train models

XGBoost + GloVe + features

In [None]:
# Per-corpus splits in the order:
# text train, label train, text test, label test, feature train, feature test.
datasets = {
    'CEFR': [CEFR_X_train, CEFR_y_train, CEFR_X_test, CEFR_y_test, CEFR_feature_train, CEFR_feature_test],
    'CLEC': [CLEC_X_train, CLEC_y_train, CLEC_X_test, CLEC_y_test, CLEC_feature_train, CLEC_feature_test],
    'CLOTH': [CLOTH_X_train, CLOTH_y_train, CLOTH_X_test, CLOTH_y_test, CLOTH_feature_train, CLOTH_feature_test],
    'NES': [NES_X_train, NES_y_train, NES_X_test, NES_y_test, NES_feature_train, NES_feature_test],
    'OSP': [OSP_X_train, OSP_y_train, OSP_X_test, OSP_y_test, OSP_feature_train, OSP_feature_test],
    'RACE': [RACE_X_train, RACE_y_train, RACE_X_test, RACE_y_test, RACE_feature_train, RACE_feature_test],
}

# Within-corpus evaluation: train and test GloVeFeatXGBoost on the same corpus
# and collect [corpus, accuracy, precision, recall, f1] rows.
all_res = []
for data, (X_train, y_train, X_test, y_test, feature_train, feature_test) in datasets.items():
    model = GloVeFeatXGBoost()
    model.train(X_train, feature_train, y_train)
    accuracy, precision, recall, f1 = model.evaluate(X_test, feature_test, y_test)
    res = [data, accuracy, precision, recall, f1]
    all_res.append(res)
print(all_res)


In [14]:
datasetname = ['CEFR', 'CLEC', 'CLOTH', 'NES', 'OSP', 'RACE']
# NOTE(review): RACE is listed in datasetname but has no entry in trainfeat /
# testsfeat - confirm it is meant to be left out of the feature-based runs.
trainfeat = [
    [CEFR_X_train, CEFR_feature_train, CEFR_y_train],
    [CLEC_X_train, CLEC_feature_train, CLEC_y_train],
    [CLOTH_X_train, CLOTH_feature_train, CLOTH_y_train],
    [NES_X_train, NES_feature_train, NES_y_train],
    [OSP_X_train, OSP_feature_train, OSP_y_train],
]
testsfeat = [
    [CEFR_X_test, CEFR_feature_test, CEFR_y_test],
    [CLEC_X_test, CLEC_feature_test, CLEC_y_test],
    [CLOTH_X_test, CLOTH_feature_test, CLOTH_y_test],
    [NES_X_test, NES_feature_test, NES_y_test],
    [OSP_X_test, OSP_feature_test, OSP_y_test],
]

# Cross-corpus evaluation: one GloVeFeatXGBoost per training corpus (index i),
# scored on every test corpus (index j).  Per-pair predictions and per-model
# metric tables are written to disk.
for i, (X_train, X_train_feat, y_train) in enumerate(trainfeat):
    model = GloVeFeatXGBoost()
    model.train(X_train, X_train_feat, y_train)
    cor_metric = []
    for j, (X_test, X_test_feat, y_test) in enumerate(testsfeat):
        y_pred = model.predict(X_test, X_test_feat)
        result = {'Numy': y_test, 'Numyp': y_pred}
        df = pd.DataFrame(result)
        cor = print_metrics(y_test, y_pred)
        cor_metric.append(cor)
        df.to_csv(
            'G:\\ML_result\\ML+glove+feature\\' + datasetname[i] + '_' + datasetname[j] + '_xgboost.txt',
            header=False, index=False, sep=' ',
        )
    cor = pd.DataFrame(columns=['NDCG', 'RJSD', 'RRNSS'], data=cor_metric)
    cor.to_csv('G:\\ML_result\\ML+glove+feature\\cor\\cor_xgboost_' + datasetname[i] + '.csv')
    

XGBoost + GloVe

In [None]:
# Within-corpus evaluation: train and test GloVeXGBoost (text only) on the same
# corpus and collect [corpus, accuracy, precision, recall, f1] rows.
all_res = []
for data, (X_train, y_train, X_test, y_test, feature_train, feature_test) in datasets.items():
    model = GloVeXGBoost()
    model.train(X_train, y_train)
    accuracy, precision, recall, f1 = model.evaluate(X_test, y_test)
    res = [data, accuracy, precision, recall, f1]
    all_res.append(res)
print(all_res)

In [7]:
train = [
    [CEFR_X_train, CEFR_y_train],
    [CLEC_X_train, CLEC_y_train],
    [CLOTH_X_train, CLOTH_y_train],
    [NES_X_train, NES_y_train],
    [OSP_X_train, OSP_y_train],
    [RACE_X_train, RACE_y_train],
]
tests = [
    [CEFR_X_test, CEFR_y_test],
    [CLEC_X_test, CLEC_y_test],
    [CLOTH_X_test, CLOTH_y_test],
    [NES_X_test, NES_y_test],
    [OSP_X_test, OSP_y_test],
    [RACE_X_test, RACE_y_test],
]

# Cross-corpus evaluation: one GloVeXGBoost per training corpus (index i),
# scored on every test corpus (index j).  Per-pair predictions and per-model
# metric tables are written to disk.
for i, (X_train, y_train) in enumerate(train):
    model = GloVeXGBoost()
    model.train(X_train, y_train)
    cor_metric = []
    for j, (X_test, y_test) in enumerate(tests):
        y_pred = model.predict(X_test)
        result = {'Numy': y_test, 'Numyp': y_pred}
        df = pd.DataFrame(result)
        cor = print_metrics(y_test, y_pred)
        cor_metric.append(cor)
        df.to_csv(
            'G:\\ML_result\\ML+glove-new\\' + datasetname[i] + '_' + datasetname[j] + '_xgboost.txt',
            header=False, index=False, sep=' ',
        )
    cor = pd.DataFrame(columns=['NDCG', 'RJSD', 'RRNSS'], data=cor_metric)
    cor.to_csv('G:\\ML_result\\ML+glove-new\\cor\\cor_xgboost_' + datasetname[i] + '.csv')
    

SVM + GloVe + features

In [None]:
# Within-corpus evaluation: train and test GloVeFeatSVM on the same corpus and
# collect [corpus, accuracy, precision, recall, f1] rows.
all_res = []
for data, (X_train, y_train, X_test, y_test, feature_train, feature_test) in datasets.items():
    model = GloVeFeatSVM()
    model.train(X_train, feature_train, y_train)
    accuracy, precision, recall, f1 = model.evaluate(X_test, feature_test, y_test)
    res = [data, accuracy, precision, recall, f1]
    all_res.append(res)

print(all_res)


In [16]:
# Cross-corpus evaluation: one GloVeFeatSVM per training corpus (index i),
# scored on every test corpus (index j).  Per-pair predictions and per-model
# metric tables are written to disk.
for i, (X_train, X_train_feat, y_train) in enumerate(trainfeat):
    model = GloVeFeatSVM()
    model.train(X_train, X_train_feat, y_train)
    cor_metric = []
    for j, (X_test, X_test_feat, y_test) in enumerate(testsfeat):
        y_pred = model.predict(X_test, X_test_feat)
        result = {'Numy': y_test, 'Numyp': y_pred}
        df = pd.DataFrame(result)
        cor = print_metrics(y_test, y_pred)
        cor_metric.append(cor)
        df.to_csv(
            'G:\\ML_result\\ML+glove+feature\\' + datasetname[i] + '_' + datasetname[j] + '_SVM.txt',
            header=False, index=False, sep=' ',
        )
    cor = pd.DataFrame(columns=['NDCG', 'RJSD', 'RRNSS'], data=cor_metric)
    cor.to_csv('G:\\ML_result\\ML+glove+feature\\cor\\cor_SVM_' + datasetname[i] + '.csv')
    

SVM + GloVe

In [None]:
all_res = []
for data in datasets:
    X_train, y_train, X_test, y_test ,feature_train,feature_test= datasets[data]
    model = GloVeSVM()
    model.train(X_train, y_train)
    accuracy, precision, recall, f1 = model.evaluate(X_test, y_test)
    res = [data, accuracy, precision, recall, f1]
    all_res.append(res)
print(all_res)


In [None]:
# Cross-corpus evaluation: one GloVeSVM (text only) per training corpus
# (index i), scored on every test corpus (index j).
#
# Results go to the GloVe-only output folder, the same one the GloVe-only
# XGBoost run uses.  The cell previously wrote to "ML+glove+feature" with the
# same "*_SVM" file names as the feature-based SVM run, silently overwriting
# that run's predictions and metric tables.
for i, (X_train, y_train) in enumerate(train):
    model = GloVeSVM()
    model.train(X_train, y_train)
    cor_metric = []
    for j, (X_test, y_test) in enumerate(tests):
        y_pred = model.predict(X_test)
        result = {'Numy': y_test, 'Numyp': y_pred}
        df = pd.DataFrame(result)
        cor = print_metrics(y_test, y_pred)
        cor_metric.append(cor)
        df.to_csv(
            'G:\\ML_result\\ML+glove-new\\' + datasetname[i] + '_' + datasetname[j] + '_SVM.txt',
            header=False, index=False, sep=' ',
        )
    cor = pd.DataFrame(columns=['NDCG', 'RJSD', 'RRNSS'], data=cor_metric)
    cor.to_csv('G:\\ML_result\\ML+glove-new\\cor\\cor_SVM_' + datasetname[i] + '.csv')
    