In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it
import numpy as np

In [2]:
"""
- Avg_words_per_sentence 
- Avg_syllables_per_word 
- Complex_word_percent   
- Difficult_word_percent 
- Long_sent_percent      
- Long_word_percent      
- Avg_letters_per_word   
- Comma_percent          
- Proper_noun_percent    
- Noun_percent           
- Pronoun_percent        
- Conj_percent           

- Tokens            
- Words             
- Sentences         
- N_words           
- N_sentences       
- N_syllables       
- N_polysyllables   
"""
import pandas as pd
import spacy
import pyphen
import benepar

SPACY_MODEL = "en_core_web_sm"
import en_core_web_sm

def _get_words(x):
    words = [token.text for token in x if token.is_punct != True]
    return words

def words_and_sentences(df):
    """Tokenise ``df['Text']`` with spaCy and add word/sentence columns.

    The following columns are added to ``df`` in place:

    - ``Tokens``: the parsed spaCy document for each text
    - ``Words``: texts of the non-punctuation tokens (see ``_get_words``)
    - ``Sentences``: list of sentence spans of the document
    - ``N_words`` / ``N_sentences``: lengths of the two lists above
    - ``Avg_words_per_sentence``: ``N_words / N_sentences``

    Returns the same DataFrame object that was passed in.
    """
    # Use the module-level model name so the pipeline is configured in one
    # place. The dependency parser and NER are excluded; sentence boundaries
    # come from the rule-based sentencizer added below.
    nlp = spacy.load(SPACY_MODEL, exclude=['parser', 'ner'])
    nlp.add_pipe('sentencizer')

    # nlp.pipe processes the texts in batches, which is spaCy's recommended
    # way to parse many documents, instead of one nlp(text) call per row.
    df['Tokens'] = list(nlp.pipe(df['Text']))
    df['Words'] = df['Tokens'].apply(_get_words)
    df['Sentences'] = df['Tokens'].apply(lambda doc: list(doc.sents))
    df['N_words'] = df['Words'].apply(len)
    df['N_sentences'] = df['Sentences'].apply(len)
    df["Avg_words_per_sentence"] = df["N_words"] / df["N_sentences"]
    return df

def _count_hyphens(text, dic):
    return dic.inserted(text).count("-")

def syllables(df):
    """Add syllable-count features derived from pyphen hyphenation.

    Requires the ``Text`` and ``N_words`` columns. Adds ``N_syllables``
    (``N_words`` plus the number of hyphens pyphen produces for the text) and
    ``Avg_syllables_per_word`` (``N_syllables / N_words``). The intermediate
    ``N_hyphens`` column is created and removed again before returning.

    Returns the same DataFrame object that was passed in.
    """
    hyphenator = pyphen.Pyphen(lang='en_EN')

    # NOTE(review): hyphenation is applied to each whole raw text rather than
    # word by word; presumably literal "-" characters already present in the
    # text are counted as well -- confirm this is intended.
    df["N_hyphens"] = df["Text"].apply(_count_hyphens, args=(hyphenator,))
    df["N_syllables"] = df["N_words"].add(df["N_hyphens"])
    df["Avg_syllables_per_word"] = df["N_syllables"].div(df["N_words"])

    del df["N_hyphens"]
    return df

def _get_dale_chall_easy_words():
    easy_words = set()
    with open("dale_chall_easy_word_list.txt") as file:
        lines = [line.rstrip('\n') for line in file]
        for line in lines:
            easy_words.add(line.lower())
    return easy_words


def _get_num_difficult_words(text, easy_words):
    n = 0
    for word in text:
        if word.lower() not in easy_words:
            n += 1
    return n


def difficult_words_pct(df):
    """Add ``Difficult_word_percent`` to ``df`` and return ``df``.

    The value is the number of entries in ``Words`` that are missing from the
    easy-word list, divided by ``N_words``. It is a ratio; it is not
    multiplied by 100.
    """
    easy = _get_dale_chall_easy_words()
    n_difficult = df["Words"].apply(_get_num_difficult_words, args=(easy,))
    df["Difficult_word_percent"] = n_difficult / df["N_words"]
    return df


# POLYSYLLABLES (WORDS WITH 3 OR MORE SYLLABLES)

def _count_polysyllables(words, dic):
    n_complex = 0    
    for word in words:
        # if the word has more than 3 or more syllables it will have 2 or more hyphens
        if dic.inserted(word).count("-") >= 2:
            n_complex += 1    
    return n_complex


def polysyllables(df):
    """Add ``N_polysyllables`` to ``df`` and return ``df``.

    For each row, the value is the number of entries in ``Words`` that pyphen
    hyphenates with two or more hyphens (see ``_count_polysyllables``).
    """
    hyphenator = pyphen.Pyphen(lang='en_EN')
    df["N_polysyllables"] = df["Words"].apply(_count_polysyllables, args=(hyphenator,))
    return df

# PERCENTAGE OF COMPLEX WORDS (GUNNING FOG)

def complex_words_pct(df):
    """Add ``Complex_word_percent`` (``N_polysyllables / N_words``) to ``df``.

    The value is a ratio; it is not multiplied by 100. Returns the same
    DataFrame object that was passed in.
    """
    ratio = df["N_polysyllables"].div(df["N_words"])
    df["Complex_word_percent"] = ratio
    return df


# PERCENTAGE OF LONG SENTENCES (LONGER THAN 25 WORDS)

def _get_n_long_sent(sentences):
    n = 0
    for sentence in sentences:
        if len(sentence) > 25:
            n += 1
    return n


def long_sent_pct(df):
    """Add ``Long_sent_percent`` to ``df`` and return ``df``.

    The value is the number of long sentences in ``Sentences`` (as counted by
    ``_get_n_long_sent``) divided by ``N_sentences``. It is a ratio; it is
    not multiplied by 100.
    """
    n_long = df["Sentences"].apply(_get_n_long_sent)
    df["Long_sent_percent"] = n_long.div(df["N_sentences"])
    return df


# PERCENTAGE OF LONG WORDS (LONGER THAN 8 CHARACTERS)
def _get_n_long_word(words):
    n = 0
    for word in words:
        if len(word) > 8:
            n += 1
    return n


def long_word_pct(df):
    """Add ``Long_word_percent`` to ``df`` and return ``df``.

    The value is the number of long words in ``Words`` (as counted by
    ``_get_n_long_word``) divided by ``N_words``. It is a ratio; it is not
    multiplied by 100.
    """
    n_long = df["Words"].apply(_get_n_long_word)
    df["Long_word_percent"] = n_long.div(df["N_words"])
    return df


def _get_n_letters(words):
    n = 0
    for word in words:
        n += len(word)
    return n


def avg_letters_per_word(df):
    """Add ``Avg_letters_per_word`` to ``df`` and return ``df``.

    The value is the total character count over ``Words`` (see
    ``_get_n_letters``) divided by ``N_words``.
    """
    n_letters = df["Words"].apply(_get_n_letters)
    df["Avg_letters_per_word"] = n_letters.div(df["N_words"])
    return df


def _get_n_comma_sent(sentences):
    n = 0
    for sentence in sentences:
        if str(sentence).find(",") != -1:
            n += 1
    return n


def comma_pct(df):
    """Add ``Comma_percent`` to ``df`` and return ``df``.

    The value is the number of sentences in ``Sentences`` that contain a
    comma (see ``_get_n_comma_sent``) divided by ``N_sentences``. It is a
    ratio; it is not multiplied by 100.
    """
    n_with_comma = df["Sentences"].apply(_get_n_comma_sent)
    df["Comma_percent"] = n_with_comma.div(df["N_sentences"])
    return df


def _get_n_pos(tokens, pos_list):
    n = 0
    for token in tokens:
        for pos in pos_list:
            if token.pos_ == pos:
                n += 1
    return n


def pos_features(df):
    """Add part-of-speech ratio columns to ``df`` and return ``df``.

    For each feature below, the tokens in ``Tokens`` whose ``pos_`` matches
    one of the listed tags are counted and divided by ``N_words``:

    - ``Noun_percent``: NOUN, PROPN
    - ``Proper_noun_percent``: PROPN
    - ``Pronoun_percent``: PRON
    - ``Conj_percent``: CONJ, CCONJ

    The values are ratios; they are not multiplied by 100.
    """
    # (output column, tags counted for it) -- kept in the order the columns
    # are appended to the frame.
    tag_groups = (
        ("Noun_percent", ["NOUN", "PROPN"]),
        ("Proper_noun_percent", ["PROPN"]),
        ("Pronoun_percent", ["PRON"]),
        ("Conj_percent", ["CONJ", "CCONJ"]),
    )
    for column, tags in tag_groups:
        matches = df["Tokens"].apply(lambda doc, tags=tags: _get_n_pos(doc, tags))
        df[column] = matches / df["N_words"]
    return df


def remove_aux_features(df):
    """Drop the intermediate helper columns from ``df`` in place.

    Removes ``Tokens``, ``Words``, ``Sentences``, ``N_words``,
    ``N_sentences``, ``N_syllables`` and ``N_polysyllables``; all of them
    must be present. Returns the same DataFrame object that was passed in.
    """
    aux_columns = [
        "Tokens",
        "Words",
        "Sentences",
        "N_words",
        "N_sentences",
        "N_syllables",
        "N_polysyllables",
    ]
    df.drop(columns=aux_columns, inplace=True)
    return df

In [3]:
"""
- NP_per_sent
- VP_per_sent
- PP_per_sent
- SBAR_per_sent
- SBARQ_per_sent
- avg_NP_size
- avg_VP_size
- avg_PP_size
- avg_parse_tree

"""
from collections import Counter, defaultdict
import pandas as pd
import spacy
import nltk
import benepar
from benepar import BeneparComponent, NonConstituentException
benepar.download('benepar_en3')

def _parse_tree_height(sent):
    
    children = list(sent._.children)
    if not children:
        return 0
    else:
        return max(_parse_tree_height(child) for child in children) + 1


def _get_constituents(tokens):
    const_counter = Counter()
    const_lengths = defaultdict(list)

    for sentence in tokens.sents:
        for const in sentence._.constituents:
            # add constituent to constituent counter
            const_counter.update(Counter(const._.labels))
            
            # append the length of the constituent
            for label in const._.labels:
                const_lengths[label].append(len(const))
    
    # for each constituent, get average of constituent's lengths
    const_avgs = defaultdict(int)
    for key in const_lengths.keys():
        avg = 0.0
        for length in const_lengths[key]: 
            avg += length
        avg /= len(const_lengths[key])
        
        const_avgs[key] = avg
         
    return const_counter, const_avgs


def _get_parse_tree_height(tokens):
    """Return ``(mean parse-tree height, sentence count)`` for a document.

    The height of every sentence in ``tokens.sents`` is computed with
    ``_parse_tree_height`` and averaged over the number of sentences.

    Raises ``ZeroDivisionError`` when the document has no sentences.
    """
    total_height = 0.0
    for sentence in tokens.sents:
        total_height += _parse_tree_height(sentence)

    # The sentences are iterated a second time purely to count them.
    n_sentences = sum(1 for _ in tokens.sents)

    return total_height / n_sentences, n_sentences


def _get_parse_tree_features(tokens):
    """Compute the nine constituency-parse features of one document.

    Returns a 9-tuple, in this order:

    ``NP_per_sent, VP_per_sent, PP_per_sent, SBAR_per_sent, SBARQ_per_sent,
    avg_NP_size, avg_VP_size, avg_PP_size, avg_parse_tree``

    The ``*_per_sent`` values are label counts divided by the number of
    sentences, the ``avg_*_size`` values are mean constituent lengths (see
    ``_get_constituents``), and ``avg_parse_tree`` is the mean parse-tree
    height (see ``_get_parse_tree_height``).
    """
    label_counts, mean_sizes = _get_constituents(tokens)
    mean_height, n_sentences = _get_parse_tree_height(tokens)

    per_sentence = tuple(
        label_counts[label] / n_sentences
        for label in ('NP', 'VP', 'PP', 'SBAR', 'SBARQ')
    )
    sizes = tuple(mean_sizes[label] for label in ('NP', 'VP', 'PP'))

    return per_sentence + sizes + (mean_height,)
    

def parse_tree_features(df):
    """Add the constituency-parse feature columns to ``df`` and return ``df``.

    Each entry of ``df['Text']`` is parsed with the ``en_core_web_sm``
    pipeline (NER disabled) extended by the ``benepar_en3`` component, and
    the values from ``_get_parse_tree_features`` are stored in the columns
    ``NP_per_sent``, ``VP_per_sent``, ``PP_per_sent``, ``SBAR_per_sent``,
    ``SBARQ_per_sent``, ``avg_NP_size``, ``avg_VP_size``, ``avg_PP_size``
    and ``avg_parse_tree``. The temporary ``B_Tokens`` column holding the
    parsed documents is removed again before returning.
    """
    nlp = en_core_web_sm.load(disable=['ner'])
    # The benepar component is registered differently on spaCy 2.x than on
    # later versions.
    on_spacy_v2 = spacy.__version__.startswith('2')
    if on_spacy_v2:
        nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
    else:
        nlp.add_pipe("benepar", config={"model": "benepar_en3"})

    # Parse every text.
    df['B_Tokens'] = df['Text'].apply(lambda text: nlp(text))

    # One 9-tuple per document, transposed into nine per-column tuples.
    per_document = df['B_Tokens'].map(_get_parse_tree_features)
    (np_rate, vp_rate, pp_rate, sbar_rate, sbarq_rate,
     np_size, vp_size, pp_size, tree_height) = zip(*per_document)

    df['NP_per_sent'] = np_rate
    df['VP_per_sent'] = vp_rate
    df['PP_per_sent'] = pp_rate
    df['SBAR_per_sent'] = sbar_rate
    df['SBARQ_per_sent'] = sbarq_rate
    df['avg_NP_size'] = np_size
    df['avg_VP_size'] = vp_size
    df['avg_PP_size'] = pp_size
    df['avg_parse_tree'] = tree_height

    # The parsed documents are no longer needed.
    del df["B_Tokens"]

    return df

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     e:\Anaconda\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [11]:
datasets=['CEFR','CLEC','CLOTH','NES','OSP','RACE']

# Feature extractors, applied to each dataset in exactly this order. Every
# step takes the DataFrame and returns it with new columns added (or, for
# remove_aux_features, with the helper columns dropped). Later steps read
# columns produced by earlier ones, so the order matters;
# parse_tree_features only reads 'Text' and therefore runs after the helper
# columns have been removed.
feature_steps = (
    words_and_sentences,
    syllables,
    difficult_words_pct,
    polysyllables,
    complex_words_pct,
    long_sent_pct,
    long_word_pct,
    avg_letters_per_word,
    comma_pct,
    pos_features,
    remove_aux_features,
    parse_tree_features,
)

for data in datasets:
    # Each dataset is read from ../L2/data/<name>.csv; the first CSV column
    # is used as the index.
    df = pd.read_csv("../L2/data/"+str(data)+".csv", index_col = 0)
    # Cast to str so every row can be fed to the text pipelines.
    df['Text'] = df['Text'].astype(str)
    for step in feature_steps:
        df = step(df)
    df.to_csv("../L2/feature/"+str(data)+"_with_features.csv", encoding='utf-8')

