# Classification experiments
This notebook contains the following experiments of classification for the Newsela and Britannica texts:
* For both data sets:
    1. Threeway classification of all texts.
    2. Pairwise classification of the same text.
    3. Pairwise classification of all texts.  
$~$
* Points 2. and 3. while training on Newsela and testing on Britannica.
$~$
* Point 3. for comparing two outlier removal methods.

#### Import requirements

In [1]:
import random
from itertools import combinations
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

#### Load DFs and create variables for necessary feature sets

In [2]:
def load_data(file_name):
    df = pd.read_csv(file_name, index_col=0)
    return df

In [4]:
df_brit = load_data("csv/britannica_with_features.csv")
df_new = load_data("csv/newsela_with_features.csv")
sematch = load_data('csv/britannica_sematch_with_features.csv')
semrel = load_data('csv/britannica_semrel_with_features.csv')
short_brit = df_brit[:45]

In [5]:
all_features = [
    "node_degree",
    "clustering_coef",
    "av_pagerank",
    "pairwise_distance_per_sent",
    "graph_conncomp_per_sent",
    "clustering_coef_per_sent",
    "graph_density_per_sent",
    "graph_conncomp_per_para",
    "clustering_coef_per_para",
    "graph_density_per_para",
    "pairwise_distance_per_para",
    "pairwise_semrel_per_sent",
    "pairwise_semrel_per_para",
]

no_para = [
    "node_degree",
    "clustering_coef",
    "av_pagerank",
    "pairwise_distance_per_sent",
    "graph_conncomp_per_sent",
    "clustering_coef_per_sent",
    "graph_density_per_sent",
    "pairwise_semrel_per_sent",
]

best_features_brit = [
    "av_pagerank",
    "graph_conncomp_per_sent",
    "graph_conncomp_per_para",
]
best_features_new = [
    "av_pagerank",
    "graph_conncomp_per_sent",
    "clustering_coef_per_sent",
]

#### Classification on whole data

In [6]:
def class_whole_data(df, feat, print_features=False, cross_val=10):
    '''
    function that returns accuracy and variance of a classifier
    given the data frame and the features
    the considered features can be given with the 'feat' argument
    the classes are the different values under the "level" column of the DF
    it can optionally print the feature values of a classifier
    '''
    
    clf = RandomForestClassifier(n_estimators=100, max_depth=None)
    if print_features:
        X_train, X_test, y_train, y_test = train_test_split(
            df[feat], df["level"], test_size=0.1
        )
        X_train = X_train.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
        X_test = X_test.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
        clf.fit(X_train, y_train)
        results = clf.predict(X_test)
    
        print(sorted(list(zip(clf.feature_importances_, feat))))

    df[feat] = df[feat].apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
    score = cross_val_score(clf, df[feat], df["level"], cv=cross_val)
    return np.mean(score), np.var(score)

In [7]:
print("Britannica all features: ", class_whole_data(df_brit, all_features))
print("Britannica best features: ", class_whole_data(df_brit, best_features_brit))
print("Newsela all features: ", class_whole_data(df_new, no_para))
print("Newsela best features: ", class_whole_data(df_new, best_features_new))

Britannica all features:  (0.74, 0.00617777777777778)
Britannica best features:  (0.7444444444444445, 0.008765432098765435)
Newsela all features:  (0.323, 0.018941)
Newsela best features:  (0.28700000000000003, 0.013581)


#### Pairwise classification of the same text

In [8]:
def create_pairs_of_text(X, y, level_no):
    '''
    function that creates pairs of texts from a data frame with the topic being controlled
    the number of levels has to be given, 5 with Newsela and 3 with Britannica
    e.g. the pairs in a Britannica data set will be kid vs student, student vs scholar and kid vs scholar 
    for the topic chicken etc.
    the order is randomized (so sometimes the more complex text is first in the pair and other times 
    the simpler one)
    '''
    pairs = []
    sub_pairs = []
    X = X.values.tolist()
    to_sort = list(zip(X, y))
    for ind, lst in enumerate(to_sort):
        if ind % level_no == 0 and ind != 0:
            for pair in list(combinations(sub_pairs, 2)):
                pairs.append(pair)
            sub_pairs = []
            sub_pairs.append(lst)
        else:
            sub_pairs.append(lst)

    return [random.sample(list(tup), 2) for tup in pairs]


In [29]:
def class_pairwise(
    df_train,
    df_test,
    feat,
    levels_train,
    levels_test,
    print_features=False,
    cross_val=10):
    '''
    function that does a binary classification of data
    it works with pairs created by the create_pairs_of_text function
    can be used with one data set and also with different training and test sets
    arguments:
    df_train:       training data
    df_test:        test data, can be same or different as df_train
    feat:           list of features to be considered for classification
    levels_train:   number of levels in training data, 5 for newsela, 3 for britannica
    levels_test:    number of levels in test data
    print_features: if True, will print sorted list of feature values
    cross_val:      number of folds for cross-validation
    '''
    
    try:
        if df_train.all == df_test.all:

                clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0, bootstrap=True)
                
                pairs = create_pairs_of_text(df_train[feat], df_train['level'], levels_train)
                labels = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs]
                pairs = [
                    [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                    for tup in pairs]
                
                if print_features:
                    print(sorted(list(zip(clf.feature_importances_, feat))))
                score = cross_val_score(clf, pairs, labels, cv=cross_val)

                return np.mean(score), np.var(score)
            
    
    except:
        accuracies = []

        for exp in range(cross_val):
           
                X_train = df_train[feat]
                y_train = df_train["level"]
                X_test = df_test[feat]
                y_test = df_test["level"]

                X_train = X_train.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
                X_test = X_test.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
                
                pairs_train = create_pairs_of_text(X_train, y_train, levels_train)
                labels_train = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs_train]
                pairs_train = [
                    [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                    for tup in pairs_train
                ]

                pairs_test = create_pairs_of_text(X_test, y_test, levels_test)
                labels_test = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs_test]
                pairs_test = [
                    [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                    for tup in pairs_test
                ]

                clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0, bootstrap=True)
                clf.fit(pairs_train, labels_train)
                results = clf.predict(pairs_test)
                accuracies.append(accuracy_score(results, labels_test))

                #print(accuracies)
                if print_features:
                    print(sorted(list(zip(clf.feature_importances_, feat))))

                return np.mean(accuracies), np.var(accuracies, dtype=np.float64)

In [30]:
print(
    "Britannica all features: ",
    class_pairwise(
        df_brit, df_brit, all_features, 3, 3, print_features=False, cross_val=10
    ),
)
print(
    "Britannica best features: ",
    class_pairwise(
        df_brit, df_brit, best_features_brit, 3, 3, print_features=False, cross_val=10
    ),
)
print(
    "Newsela all features: ",
    class_pairwise(df_new, df_new, no_para, 5, 5, print_features=False, cross_val=10),
)
print(
    "Newsela best features: ",
    class_pairwise(
        df_new, df_new, best_features_new, 5, 5, print_features=False, cross_val=10
    ),
)
print(
    "Newsela-Britannica all features: ",
    class_pairwise(df_new, df_brit, no_para, 5, 3, print_features=False, cross_val=10),
)
print(
    "Newsela-Britannica best features: ",
    class_pairwise(
        df_new, df_brit, best_features_new, 5, 3, print_features=False, cross_val=10
    ),
)

Britannica all features:  (0.8653186274509803, 0.008018351982677602)
Britannica best features:  (0.8798611111111111, 0.007600790895061727)
Newsela all features:  (0.7937579056900665, 0.04246451023808745)
Newsela best features:  (0.8023817572712046, 0.03363391713047783)
Newsela-Britannica all features:  (0.8448275862068966, 0.0)
Newsela-Britannica best features:  (0.7873563218390804, 0.0)


#### Pairwise classification between all texts

In [31]:
def create_all_pairs(X, y):
     '''
    function that creates pairs of texts (i.e. features values) from a data frame 
    the order is randomized (so sometimes the more complex text is first in the pair and other times 
    the simpler one)
    '''
    pairs = []
    X = X.values.tolist()

    pairs = list(combinations(list(zip(X, y)), 2))

    return [random.sample(list(tup), 2) for tup in pairs]

In [56]:
def class_pairwise_all(df_train, df_test, feat, print_features=False, cross_val=10):
    
     '''
    function that does a binary classification of data
    it works with pairs created by the create_all_pairs function
    can be used with one data set and also with different training and test sets
    arguments:
    df_train:       training data
    df_test:        test data, can be same or different as df_train
    feat:           list of features to be considered for classification
    print_features: if True, will print sorted list of feature values
    cross_val:      number of folds for cross-validation
    '''
    
    
    accuracies = []

    try:
            if df_train.all == df_test.all:
                                               
                clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0, bootstrap=True)
                pairs = create_all_pairs(df_train[feat], df_train['level'])
                labels = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs]
                pairs = [
                    [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                    for tup in pairs
                ]

                if print_features:
                    print(sorted(list(zip(clf.feature_importances_, feat))))
                score = cross_val_score(clf, pairs, labels, cv=cross_val)
            
                return np.mean(score), np.var(score)
 
                
    except:
        
        for exp in range(cross_val):

        
            X_train = df_train[feat]
            y_train = df_train["level"]
            X_test = df_test[feat]
            y_test = df_test["level"]

            X_train = X_train.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
            X_test = X_test.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)

            pairs_train = create_all_pairs(X_train, y_train)
            labels_train = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs_train]
            pairs_train = [
                [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                for tup in pairs_train
            ]

            pairs_test = create_all_pairs(X_test, y_test)
            labels_test = [0 if pair[0][1] < pair[1][1] else 1 for pair in pairs_test]
            pairs_test = [
                [item for sublist in list((tup[0][0], tup[1][0])) for item in sublist]
                for tup in pairs_test
            ]


            clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
            clf.fit(pairs_train, labels_train)
            results = clf.predict(pairs_test)

            accuracies.append(accuracy_score(results, labels_test))

            if print_features:
                print(sorted(list(zip(clf.feature_importances_, feat))))

            return np.mean(accuracies), np.var(accuracies)

In [57]:
print(
    "Britannica all features: ",
    class_pairwise_all(
        df_brit, df_brit, all_features, print_features=False, cross_val=10
    ),
)
print(
    "Britannica best features: ",
    class_pairwise_all(
        df_brit, df_brit, best_features_brit, print_features=False, cross_val=10
    ),
)
print(
    "Newsela all features: ",
    class_pairwise_all(df_new, df_new, no_para, print_features=False, cross_val=10),
)
print(
    "Newsela best features: ",
    class_pairwise_all(
        df_new, df_new, best_features_new, print_features=False, cross_val=10
    ),
)
print(
    "Newsela-Britannica all features: ",
    class_pairwise_all(df_new, df_brit, no_para, print_features=False, cross_val=10),
)
print(
    "Newsela-Britannica best features: ",
    class_pairwise_all(
        df_new, df_brit, best_features_new, print_features=False, cross_val=10
    ),
)

Britannica all features:  (0.8808595707486129, 0.006089433839461934)
Britannica best features:  (0.8761713975852029, 0.005529062640828577)
Newsela all features:  (0.8749146084446311, 0.0035003827400735496)
Newsela best features:  (0.8470787887992092, 0.003261821302307271)
Newsela-Britannica all features:  (0.7528890600924499, 0.0)
Newsela-Britannica best features:  (0.7791474062660503, 0.0)


#### Outlier removal

In [54]:
print(
    "No outlier removal: ",
    class_pairwise_all(
        short_brit, short_brit, all_features, print_features=False, cross_val=10
    ),
)
print(
    "Sematch: ",
    class_pairwise_all(
       sematch , sematch, all_features, print_features=False, cross_val=10
    ),
)
print(
    "Exclusivity-based semantic relatedness: ",
    class_pairwise_all(
        semrel, semrel, all_features, print_features=False, cross_val=10
    ),
)

No outlier removal:  (0.87518820861678, 0.005816175480398116)
Sematch:  (0.9582321493152193, 0.00182396167879558)
Exclusivity-based semantic relatedness:  (0.926944667201283, 0.0033143152226128595)
