Univariate feature selection file
Corresponds with section 5 in paper
Author: Kyle Lane
University of Rochester

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np

In [2]:
#cleaning up data, requires "dataframe.csv" to function
def read_data():
    
    data = pd.read_csv("dataframe.csv")
    data["dependencies"].fillna("", inplace=True)
    data["relevant_dependencies"].fillna("", inplace=True)
    data["dependencies"] = data["dependencies"].apply(lambda x: x.split("),"))
    data["relevant_dependencies"] = data["relevant_dependencies"].apply(lambda x: x.split("),"))
    data.drop('recommended_exclusion', axis=1, inplace=True)
    data.drop('sentence_id', axis=1, inplace=True)
    
    return data

#takes rows in dataframe and strips out everything we dont need. 
#we also create the dependency id's which will be used as 'words'
#this function is purposly bloated to allow for further development
def process_data(data):
    dataset = []
    feature_set = []
    typeCount = {}
    for i, (position, word, gerund, tags, dependencies, relDependencies, sentence) in data.iterrows():
        feature_set.append(gerund)
        feature_set.append(word)
        typeCount[gerund] = typeCount[gerund] + 1 if gerund in typeCount else 0   
        for dependency in relDependencies:
            if len(dependency.split())<3:
                continue
            id = dependency.split()[0]
            w1 = dependency.split()[1][1:-1].split("-")[0]
            p1 = dependency.split()[1][1:-1].split("-")[1]
            w2 = dependency.split()[2].split("-")[0]
            p2 = dependency.split()[2].split("-")[1]
            organized_dependency = (id, w1, p1, w2, p2)
            feature_set.append(organized_dependency)  
        dataset.append(feature_set)
        feature_set = []
    #print(typeCount) turn on to see ammount of each gerund type
    return dataset

#makes the binary feature array. each dependency is 0 unless present
def makeArray(item, dependencies):
    features = {"gerund": item[0]}
    features.update({f: 0 for f in dependencies})
    for feature in item[2:]:
        features[feature[0]] = 1
    return features

#counts num of each dependency, doubles as a set of dependencys if taken as keys
def count_identifiers(data):
    identifier_counts = {}
    for item in data:
        dependencies = item[2:]
        for dependency in dependencies:
            identifier = dependency[0]
            identifier_counts[identifier] = identifier_counts.get(identifier, 0) + 1
    return identifier_counts

In [3]:
data = read_data()
goodData = process_data(data)
dependencies = list(count_identifiers(goodData).keys())
dicts = [makeArray(s, dependencies) for s in goodData]

p_data = pd.DataFrame.from_dict(dicts)


In [4]:
#prints dependency counts for each type, very rough
grouped = p_data.groupby("gerund")
for name, group in grouped:
    print(f"Gerund: {name}")
    print(group.sum())

Gerund: acc-ing
gerund          acc-ingacc-ingacc-ingacc-ingacc-ingacc-ingacc-...
xcomp                                                         451
obj                                                          2516
advmod                                                        195
obl                                                          2246
advcl                                                         798
compound:prt                                                   52
conj                                                         1058
mark                                                           91
case                                                         1293
amod                                                          357
nmod                                                         1706
acl                                                          3067
punct                                                         436
ccomp                                                       

In [9]:
X = p_data.drop('gerund', axis=1)
y = p_data['gerund']

# Perform chi-squared test for feature selection
#referenced from https://scikit-learn.org/stable/modules/feature_selection.html 

le = LabelEncoder()
y_int = le.fit_transform(y)

selector = SelectKBest(score_func=chi2, k=10)
X_new = selector.fit_transform(X, y_int)

# Get the indices of the top k features
top_k_indices = selector.get_support(indices=True)

# Get the column names of the top k features
top_k_features = X.columns[top_k_indices]



In [6]:
# Print the top k features out of order
print("The top features are:")
print(top_k_features)

The top features are:
Index(['advcl', 'mark', 'case', 'amod', 'nmod', 'acl', 'det', 'nmod:poss',
       'nsubj', 'compound'],
      dtype='object')


In [10]:
# Print the chi-squared statistics for all features (not ordered)
print("Chi-squared statistics for all features:")
for i in range(X.shape[1]):
    score = selector.scores_[i]
    feature_name = f"{dependencies[i]}"
    print(f"{feature_name}: {score:.2f}")

Chi-squared statistics for all features:
xcomp: 327.40
obj: 1058.30
advmod: 315.03
obl: 48.73
advcl: 2611.13
compound:prt: 91.79
conj: 193.69
mark: 4334.05
case: 2992.72
amod: 2781.70
nmod: 9532.28
acl: 2064.59
punct: 569.79
ccomp: 41.60
cop: 110.76
det: 44933.12
nmod:poss: 52850.92
csubj: 311.49
cc: 421.41
nsubj: 1866.38
appos: 346.50
parataxis: 8.33
iobj: 24.15
acl:relcl: 378.59
aux:pass: 183.95
nsubj:pass: 333.74
fixed: 160.22
compound: 9459.36
cc:preconj: 20.29
nummod: 60.14
det:predet: 373.21
csubj:pass: 1.89
obl:tmod: 20.42
aux: 140.92
obl:npmod: 4.02
flat: 32.19
dep: 12.02
reparandum: 0.64
vocative: 1.29
expl: 29.81
nmod:npmod: 85.93
list: 50.85
nmod:tmod: 20.78
discourse: 2.43


In [8]:
# Compute the chi-squared statistic for each feature-label combination
chiArray = np.zeros((44,6))

for i in range(len(le.classes_)):
    label_mask = (y_int == i)
    chi2_vals, p_vals = chi2(X, label_mask)
    for j, chi2_val in enumerate(chi2_vals):
        chiArray[j][i] = chi2_val
        
columns = [str(i) for i in le.classes_]
index = [str(i) for i in dependencies]
chidf = pd.DataFrame(chiArray, columns=columns, index=index)
chidf.index.name = "Features"
chidf.columns.name = "Labels"
for col in chidf.columns:
    print(f"Top 5 rows with largest values in column {col}:")
    print(chidf.nlargest(5, col)[col])
    print("\n")

Top 5 rows with largest values in column acc-ing:
Features
compound    8379.175232
mark        1603.732941
acl         1264.725915
advcl        778.816682
det          652.933306
Name: acc-ing, dtype: float64


Top 5 rows with largest values in column det-ing:
Features
det      26024.344551
amod      1316.157234
nsubj      996.599992
case       966.853700
mark       761.655137
Name: det-ing, dtype: float64


Top 5 rows with largest values in column ing-of:
Features
det     16405.685957
nmod     7897.448314
case      956.065041
mark      790.801860
amod      779.965750
Name: ing-of, dtype: float64


Top 5 rows with largest values in column poss-ing:
Features
nmod:poss    43680.880244
case           576.840992
amod           402.474145
mark           320.846290
advcl          199.447756
Name: poss-ing, dtype: float64


Top 5 rows with largest values in column poss-ing-of:
Features
nmod:poss    8756.284384
nmod          641.731022
case           79.218021
mark           67.157359
advcl   