### 1. Extract training split using train-test-split.csv

In [2]:
import os
import numpy as np
import pandas as pd

base = os.path.dirname(os.path.dirname(os.path.abspath("Preliminary_Statistics.ipynb")))
split_file_name = base + "\data\ArgumentAnnotatedEssays-2.0\\train-test-split.csv"
unified_data_file = base + '/data/unified_data_file.json'


# convert csv file into DataFrame
train_test_df = pd.read_csv(split_file_name, ';')

# convert json into DataFrame
unified_data_df = pd.read_json(unified_data_file)

# add column "SET" from train_test_df to unified_data_df
unified_data_df.insert(7,"SET",train_test_df.SET)


# get training split from unified_data_df
train_df = unified_data_df[unified_data_df.SET=="TRAIN"]


###  2. Number of essays, paragraphs, sentences and tokens

In [3]:
import spacy

# Computer number of essays
num_of_essays = train_df.shape[0]
print(f"Number of essays = {num_of_essays}")

# Computer number of paragraphs
num_pg_each_essay = train_df.text.apply(lambda x: len(x.split('\n'))-2)
num_of_pg = num_pg_each_essay.sum()
print(f"Number of paragraphs = {num_of_pg}")


# Computer number of sentences
nlp = spacy.load("en_core_web_sm")

def num_of_sents(text: str) -> int:
    parsed_essay = nlp(text.replace("\n"," "))    
    sents = []
    for sentence in parsed_essay.sents:
        sents.append(sentence.text)
    first_sen = sents[0]
    sents.remove(first_sen)
    return len(sents)

num_sents_each_essay = train_df.text.apply(num_of_sents)
num_sents = num_sents_each_essay.sum()
print(f"Number of sentences = {num_sents}")

# Computer number of tokens
def num_of_tokens(text: str) -> int:
    
    #remove topic, just remain content of each essay
    text_list = text.split("\n")
    topic = text_list[0]
    text_list.remove(topic)
    content = "\n".join(text_list)
    
    parsed_essay = nlp(content.strip())
    tokens = []
    for token in parsed_essay:
        tokens.append(token.text)
    return len(tokens)

num_tokens_each_essay = train_df.text.apply(num_of_tokens)
num_tokens = num_tokens_each_essay.sum()
print(f"Number of tokens = {num_tokens}")

Number of essays = 322
Number of paragraphs = 1464
Number of sentences = 5237
Number of tokens = 116501


### 3. Number of major claims, claims, premises

In [4]:
# Computer number of major claims
mc_in_each_essay = train_df.major_claim.apply(lambda x: len(x))
num_of_major_claims = mc_in_each_essay.sum()
print(f"Number of Major Claims = {num_of_major_claims}")

# Computer number of claims
c_in_each_essay = train_df.claims.apply(lambda x: len(x))
num_of_claims = c_in_each_essay.sum()
print(f"Number of Claims = {num_of_claims}")

# Computer number of premises
pre_in_each_essay = train_df.premises.apply(lambda x: len(x))
num_of_premises = pre_in_each_essay.sum()
print(f"Number of Premises = {num_of_premises}")

Number of Major Claims = 598
Number of Claims = 1202
Number of Premises = 3023


### 4. Number of essays with and without confirmation bias

In [5]:
# Compute number of essays with confirmation bias
with_bias = train_df.confirmation_bias.apply(lambda x: 1 if(x) else 0)
num_of_essays_with_bias = with_bias.sum()
print(f"Number of essays with confirmation bias = {num_of_essays_with_bias}") 

# Compute number of essays with confirmation bias
without_bias = train_df.confirmation_bias.apply(lambda x: 0 if(x) else 1)
num_of_essays_without_bias = without_bias.sum()
print(f"Number of essays without confirmation bias = {num_of_essays_without_bias}") 

Number of essays with confirmation bias = 122
Number of essays without confirmation bias = 200


### 5. Number of sufficient and insufficient paragraphs (arguments)

In [6]:
# Computer number of sufficient paragraphs
def count_suffi_pg(pgs: list) -> int:
    num = 0
    for pg in pgs:
        if (pg["sufficient"]):
            num = num +1
    return num
    
    
suffi_pg_each_essay = train_df.paragraphs.apply(count_suffi_pg)
num_suffi_pgs = suffi_pg_each_essay.sum()
print(f"Numer of sufficient paragraphs = {num_suffi_pgs}")

# Computer number of insufficient paragraphs
def count_insuffi_pg(pgs: list) -> int:
    num = 0
    for pg in pgs:
        if (not pg["sufficient"]):
            num = num +1
    return num
    
    
insuffi_pg_each_essay = train_df.paragraphs.apply(count_insuffi_pg)
num_insuffi_pgs = insuffi_pg_each_essay.sum()
print(f"Numer of insufficient paragraphs = {num_insuffi_pgs}")

Numer of sufficient paragraphs = 538
Numer of insufficient paragraphs = 282


### 6. Average number of tokens in major claims, claims and premises

In [7]:
# Computer number of tokens of a text in major_claims, claims, and premises
def num_text_tokens(alist: list) -> int:
    num = 0
    for element in alist:
        parsed_element = nlp(element["text"])
        tokens = []
        for token in parsed_element:
            tokens.append(token.text)
        num = num + len(tokens)
    return num

# Computer average number of tokens in major claims
num_mc_each_essay = train_df.major_claim.apply(num_text_tokens)
num_tokens_major_claims = num_mc_each_essay.sum()
print(f"Average number of tokens in major claims = {num_tokens_major_claims/num_of_major_claims}")

# Computer average number of tokens in claims
num_claims_each_essay = train_df.claims.apply(num_text_tokens)
num_tokens_claims = num_claims_each_essay.sum()
print(f"Average number of tokens in claims = {num_tokens_claims/num_of_claims}")

# Computer average number of tokens in premises
num_premises_each_essay = train_df.premises.apply(num_text_tokens)
num_tokens_premises = num_premises_each_essay.sum()
print(f"Average number of tokens in premises = {num_tokens_premises/num_of_premises}")

Average number of tokens in major claims = 14.695652173913043
Average number of tokens in claims = 15.089850249584027
Average number of tokens in premises = 17.601389348329473


### 7. The 10 most specific words in major claims, claims, and premises

In [12]:
import random

def count_words(text: str) -> set:
    words_set = set([])
    parsed_text = nlp(text)
    for token in parsed_text:
        if(not token.is_punct and not token.is_space and not token.is_stop):
            words_set.add(token.text.lower())
    return words_set

# Compute the key-words in essay topics
topics_words_set_list = []
for text in train_df.text:
    text_list = text.split("\n")
    topic = text_list[0]
    topics_words_set = count_words(topic)
    topics_words_set_list.append(topics_words_set)

specific_words_topics = set([])
for words_set in topics_words_set_list:
    specific_words_topics = specific_words_topics ^ words_set
    

# Compute the 10 most specific words in major claims
major_claims_words_set_list = []
for mc_list in train_df.major_claim:
    for mc in mc_list:
        mc_words_set = count_words(mc["text"])
        major_claims_words_set_list.append(mc_words_set)

specific_words_mc = set([])
for words_set in major_claims_words_set_list:
    specific_words_mc = specific_words_mc ^ words_set

ten_spec_words_mc =  []
mc_candidates = list(specific_words_mc & specific_words_topics)

for i in range(10):
    candidate = random.choice(mc_candidates)
    mc_candidates.remove(candidate)
    ten_spec_words_mc.append(candidate)
    
print("10 most specific words in major claims: ")
print(f"{ten_spec_words_mc}")
print()


# Compute the 10 most specific words in claims
claims_words_set_list = []
for claim_list in train_df.claims:
    for claim in claim_list:
        claims_words_set = count_words(claim["text"])
        claims_words_set_list.append(claims_words_set)


specific_words_claims = set([])
for words_set in claims_words_set_list:
    specific_words_claims = specific_words_claims ^ words_set

ten_spec_words_claim = []
claim_candidates = list(specific_words_claims & specific_words_topics)
for i in range(10):
    candidate = random.choice(claim_candidates)
    claim_candidates.remove(candidate)
    ten_spec_words_claim.append(candidate)
    
print("10 most specific words in claims: ")
print(f"{ten_spec_words_claim}")
print()

# Compute the 10 most specific words in premises
premises_words_set_list = []
for premises_list in train_df.premises:
    for premise in premises_list:
        premises_words_set = count_words(premise["text"])
        premises_words_set_list.append(premises_words_set)


specific_words_premises = set([])
for words_set in premises_words_set_list:
    specific_words_premises = specific_words_premises ^ words_set

ten_spec_words_premise = []
premise_candidates = list(specific_words_premises & specific_words_topics)
for i in range(10):
    candidate = random.choice(premise_candidates)
    premise_candidates.remove(candidate)
    ten_spec_words_premise.append(candidate)
    
print("10 most specific words in premises: ")
print(f"{ten_spec_words_premise}")


10 most specific words in major claims: 
['services', 'listening', 'method', 'rural', 'offer', 'plan', 'cultures', 'favor', 'adults', 'academic']

10 most specific words in claims: 
['issue', 'females', 'technologies', 'examination', 'destroy', 'suitable', 'increases', 'popular', 'powerful', 'touch']

10 most specific words in premises: 
['know', 'concerned', 'adverse', 'reduced', 'change', 'tv', 'properly', 'employ', 'true', 'supporting']
