In [1]:
import pandas as pd
from transformers import DataCollatorWithPadding, AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import nltk
nltk.download('punkt')
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger_eng')
nltk.download("punkt")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urasa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\urasa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\urasa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Location of the raw labelled-sentence CSV, relative to this notebook.
# Forward slashes are used because they resolve on Windows as well as on
# POSIX systems, whereas backslash separators only work on Windows.
csv_path = "../article_scraping/gemini_labelled_football_articles.csv"

def fix_csv_quotes(file_path, output_path):
    """Rewrite a CSV so that every data row is uniformly double-quoted.

    The first line (header) is copied with surrounding whitespace stripped.
    Every following line is split on the `","` field separator, all remaining
    double-quote characters are removed from each field, and the fields are
    re-joined and wrapped in a single pair of outer quotes.

    Args:
        file_path: Path of the UTF-8 CSV file to read.
        output_path: Path of the UTF-8 CSV file to write (overwritten).

    Notes:
        - An empty input file yields an empty output file instead of raising.
        - Blank lines in the input are skipped rather than written as `""`.
        - Each output line, including the header, ends with exactly one newline.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    fixed_lines = []
    if lines:
        # The newline is added once, at write time, so the header is not
        # followed by a stray blank line.
        fixed_lines.append(lines[0].strip())

    for line in lines[1:]:
        if not line.strip():
            # A blank line carries no fields; skip it instead of emitting `""`.
            continue
        # Quotes that are not part of a `","` separator are stray and removed.
        fixed_columns = [column.replace('"', '') for column in line.split('","')]
        fixed_line = '","'.join(fixed_columns).strip()
        fixed_lines.append(f'"{fixed_line}"')

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(fixed_line + '\n' for fixed_line in fixed_lines)

In [2]:
# Normalise the quoting of the raw CSV and write the cleaned copy.
# NOTE(review): this writes "validation_sentences.csv", while the next cell
# reads "fixed_gemini_labelled_articles.csv" - confirm which file is intended.
fix_csv_quotes(file_path=csv_path, output_path="validation_sentences.csv")

In [2]:
# Load the cleaned, labelled sentences.
# 'label' is read as text on purpose: every later cell selects rows with
# string comparisons (label == '1' / label == '0'). Without an explicit dtype
# those comparisons would silently match nothing if pandas inferred integers.
df = pd.read_csv("fixed_gemini_labelled_articles.csv", dtype={'label': str})

In [72]:
# Exploratory summary of the labelled sentences. This cell only reads df; it
# does not add columns to it, so re-running it (or skipping it) leaves the
# feature frame built further down unchanged.

# Class distribution: Arguments vs Non-Arguments
class_distribution = df['label'].value_counts(normalize=True)

# Sentence length analysis (characters per sentence). The lengths are kept in a
# standalone Series instead of a new df column.
sentence_lengths = df['sentence'].apply(len).rename('sentence_length')
sentence_length_stats = sentence_lengths.groupby(df['label']).describe()

# Labels are compared as strings: '1' marks arguments, '0' non-arguments.
is_argument = df['label'] == '1'
is_non_argument = df['label'] == '0'

# Top article topics by the number of arguments
top_topics_by_arguments = df.loc[is_argument, 'article_topic'].value_counts().head(10)

# Argument density per topic (share of a topic's sentences labelled '1')
argument_density = is_argument.groupby(df['article_topic']).mean().sort_values(ascending=False).head(10)


def _keyword_counts(sentences, max_features=20):
    """Count the most frequent non-stop-words in `sentences`.

    Returns a tuple of (fitted CountVectorizer, document-term matrix,
    Series of per-word totals sorted in descending order).
    """
    vectorizer = CountVectorizer(stop_words='english', max_features=max_features)
    matrix = vectorizer.fit_transform(sentences)
    totals = (
        pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
        .sum()
        .sort_values(ascending=False)
    )
    return vectorizer, matrix, totals


# Keyword analysis (frequent words in arguments vs non-arguments)
vectorizer_args, X_args, keywords_args = _keyword_counts(df.loc[is_argument, 'sentence'])
vectorizer_non_args, X_non_args, keywords_non_args = _keyword_counts(df.loc[is_non_argument, 'sentence'])

class_distribution, sentence_length_stats, top_topics_by_arguments, argument_density, keywords_args, keywords_non_args


(label
 0     0.631675
 1     0.368186
 ,0    0.000140
 Name: proportion, dtype: float64,
         count        mean        std   min   25%    50%    75%    max
 label                                                                
 ,0        1.0   58.000000        NaN  58.0  58.0   58.0   58.0   58.0
 0      4519.0   99.712768  61.842954   6.0  48.0   89.0  138.0  425.0
 1      2634.0  127.555429  62.358228  17.0  79.0  118.0  166.0  440.0,
 article_topic
 PL hits and misses: Palmer makes history; Arsenal become title favourites    53
 Man Utd ate Crystal Palace alive in stalemate; says Ten Hag                  46
 Four-goal Palmer best in PL; says Maresca after Chelsea beat Brighton        37
 Hits and misses: Jackson repaying Maresca's faith                            36
 Hits and misses: Liverpool serious contenders as City show flaws             35
 Merson Says: Man City now fully aware of threat posed by Arsenal             35
 'Stick to the plan' - Ten Hag interview in full ahea

In [3]:
# Which lexical features could help identify arguments?
# Fit a 50-term TF-IDF vocabulary on the argument sentences only (label == '1').
argument_df = df.loc[df['label'] == '1']
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = tfidf_vectorizer.fit_transform(argument_df['sentence'])

# Dense frame with one column per vocabulary term.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [4]:
# Vocabulary terms removed from the keyword features (club names and other
# football-specific words), leaving only the remaining general-purpose terms.
DOMAIN_KEYWORDS = [
    'arsenal', 'united', 'chelsea', 'city', 'club', 'don', 'football', 'game',
    'games', 'goals', 'hag', 'half', 'league', 'liverpool', 'manchester', 'man',
    'minutes', 'player', 'players', 'premier', 'season', 'second', 'team',
    'time', 'ball', 'manager', 'goal',
]

# Reassign instead of dropping in place, and ignore terms that are absent, so
# the cell can be re-run and tolerates a vocabulary that lacks some of them.
tfidf_df = tfidf_df.drop(columns=DOMAIN_KEYWORDS, errors='ignore')
tfidf_df.columns

Index(['best', 'better', 'big', 'did', 'going', 'good', 'just', 'know', 'like',
       'lot', 'make', 'need', 'new', 'play', 'really', 'right', 'said',
       'start', 'think', 've', 'want', 'way', 'win'],
      dtype='object')

In [5]:
# Score every sentence with the vectorizer fitted above, then keep only the
# keyword columns that are still present in tfidf_df.
filtered_keywords = list(tfidf_df.columns)
tfidf_all_sentences = tfidf_vectorizer.transform(df['sentence'])
tfidf_all_df = pd.DataFrame(
    tfidf_all_sentences.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_all_filtered = tfidf_all_df.loc[:, filtered_keywords]

In [6]:
# Append the keyword TF-IDF columns to df. Both frames get a fresh 0..n-1 index
# first so that rows are paired by position.
df = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, tfidf_all_filtered)],
    axis=1,
)

In [131]:
"""
Nouns and Proper Nouns:

NN: Noun, singular (e.g., "dog")
NNS: Noun, plural (e.g., "dogs")
NNP: Proper noun, singular (e.g., "John")
NNPS: Proper noun, plural (e.g., "Americans")
Verbs:

VB: Base form (e.g., "run")
VBD: Past tense (e.g., "ran")
VBG: Gerund or present participle (e.g., "running")
VBN: Past participle (e.g., "driven")
VBP: Present tense, not 3rd person singular (e.g., "run" in "they run")
VBZ: Present tense, 3rd person singular (e.g., "runs")
Adjectives and Comparatives:

JJ: Adjective (e.g., "big")
JJR: Adjective, comparative (e.g., "bigger")
JJS: Adjective, superlative (e.g., "biggest")
Adverbs:

RB: Adverb (e.g., "quickly")
RBR: Adverb, comparative (e.g., "faster")
RBS: Adverb, superlative (e.g., "fastest")
Pronouns and Determiners:

PRP: Personal pronoun (e.g., "he", "they")
PRP$: Possessive pronoun (e.g., "his", "her")
DT: Determiner (e.g., "the", "a")
WP: Wh-pronoun (e.g., "who", "what")
WP$: Possessive wh-pronoun (e.g., "whose")
WDT: Wh-determiner (e.g., "which")
Prepositions and Conjunctions:

IN: Preposition or subordinating conjunction (e.g., "in", "because")
CC: Coordinating conjunction (e.g., "and", "or")
TO: "to" (as in "to go")
Others:

MD: Modal verb (e.g., "can", "will")
CD: Cardinal number (e.g., "two", "2016")
POS: Possessive ending (e.g., "'s")
RP: Particle (e.g., "up" in "give up")
EX: Existential "there" (e.g., "there is")
FW: Foreign word (e.g., words from other languages)
UH: Interjection (e.g., "uh", "well")
SYM: Symbol (e.g., "&", "%")
$: Dollar sign
Punctuation:

.: Sentence-ending punctuation (e.g., period, exclamation point)
,: Comma
:: Colon
( and ): Parentheses


"""

'\nNouns and Proper Nouns:\n\nNN: Noun, singular (e.g., "dog")\nNNS: Noun, plural (e.g., "dogs")\nNNP: Proper noun, singular (e.g., "John")\nNNPS: Proper noun, plural (e.g., "Americans")\nVerbs:\n\nVB: Base form (e.g., "run")\nVBD: Past tense (e.g., "ran")\nVBG: Gerund or present participle (e.g., "running")\nVBN: Past participle (e.g., "driven")\nVBP: Present tense, not 3rd person singular (e.g., "run" in "they run")\nVBZ: Present tense, 3rd person singular (e.g., "runs")\nAdjectives and Comparatives:\n\nJJ: Adjective (e.g., "big")\nJJR: Adjective, comparative (e.g., "bigger")\nJJS: Adjective, superlative (e.g., "biggest")\nAdverbs:\n\nRB: Adverb (e.g., "quickly")\nRBR: Adverb, comparative (e.g., "faster")\nRBS: Adverb, superlative (e.g., "fastest")\nPronouns and Determiners:\n\nPRP: Personal pronoun (e.g., "he", "they")\nPRP$: Possessive pronoun (e.g., "his", "her")\nDT: Determiner (e.g., "the", "a")\nWP: Wh-pronoun (e.g., "who", "what")\nWP$: Possessive wh-pronoun (e.g., "whose")\nW

In [7]:
# Coarse part-of-speech groups: each group name maps to the set of POS tags
# that are counted under it. Tags not listed here belong to no group.
grouped_tags = {
    group: set(tags.split())
    for group, tags in [
        ('NOUN', 'NN NNS NNP NNPS'),
        ('VERB', 'VB VBD VBG VBN VBP VBZ'),
        ('ADJ', 'JJ JJR JJS'),
        ('ADV', 'RB RBR RBS'),
        ('PRON', 'PRP PRP$ WP WP$'),
        ('WDT', 'WDT'),
        ('PREP', 'IN TO'),
        ('MODAL', 'MD'),
        ('NUM', 'CD'),
    ]
}

In [8]:
def pos_tag_distribution(text):
    """Return the share of tokens in `text` that fall into each POS group.

    The text is tokenized and tagged, every tag is mapped to the first group
    of `grouped_tags` that contains it (tags in no group are not counted), and
    the per-group counts are divided by the total number of tagged tokens.

    Args:
        text: The sentence to analyse.

    Returns:
        A pandas Series indexed by the keys of `grouped_tags`. When the text
        yields no tokens the division is by zero, which gives NaN values.
    """
    # Reverse lookup tag -> group; setdefault keeps the first group that lists
    # a tag, matching a first-match scan over grouped_tags.
    group_of_tag = {}
    for group_name, member_tags in grouped_tags.items():
        for member in member_tags:
            group_of_tag.setdefault(member, group_name)

    tagged_tokens = pos_tag(word_tokenize(text))

    totals = dict.fromkeys(grouped_tags, 0)
    for _, tag in tagged_tokens:
        group_name = group_of_tag.get(tag)
        if group_name is not None:
            totals[group_name] += 1

    return pd.Series(totals).div(len(tagged_tokens))

# One row of POS-group shares per sentence; missing values are replaced by 0.
pos_tags_df = df['sentence'].apply(pos_tag_distribution).fillna(value=0)

# Append the POS columns to df, pairing rows by position after an index reset.
df = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, pos_tags_df)],
    axis=1,
)


In [10]:
# Number of columns in df after the keyword and POS features were appended.
df.shape[1]

35

In [104]:
"""
one_hot_encoder = OneHotEncoder()
topic_encoded = one_hot_encoder.fit_transform(df[['article_topic']]).toarray()
topic_encoded_df = pd.DataFrame(topic_encoded, columns=one_hot_encoder.get_feature_names_out(['article_topic']))
df = pd.concat([df, topic_encoded_df], axis=1)
"""

In [9]:
# Argument density per article topic: the mean of the boolean "label == '1'"
# flags over the sentences of each topic.
argument_flags = df['label'] == '1'
argument_density_per_topic = argument_flags.groupby(df['article_topic']).mean()

# Give every sentence the density of the topic it belongs to.
df['argument_density'] = df['article_topic'].map(argument_density_per_topic)

In [103]:
def print_argument_density(argument_density_per_topic):
    """Print each topic with its argument density, highest density first.

    Args:
        argument_density_per_topic: Mapping-like object whose `.items()`
            yields (topic, density) pairs, e.g. a dict or a pandas Series.
    """
    ranked = sorted(
        argument_density_per_topic.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # A single print call with a newline separator emits the header followed
    # by one "topic: density" line per entry.
    print(
        "Article Topic - Argument Density:",
        *(f"{topic}: {density}" for topic, density in ranked),
        sep="\n",
    )

# List every article topic with its argument density, highest first.
print_argument_density(argument_density_per_topic)

"""
Could be interesting to look into; however, I don't yet know how to make it more useful.
The more opinionated the article topic, the higher the argument density.
"""

Article Topic - Argument Density:
On-song Fulham inflict first loss of season on Newcastle: 0.7096774193548387
Man Utd ate Crystal Palace alive in stalemate; says Ten Hag: 0.7076923076923077
Brighton win five-goal thriller as Spurs suffer second-half collapse: 0.64
Neville: Arsenal the only team who can take on ominous Man City: 0.627906976744186
Ange: Brighton collapse worst defeat of my Spurs tenure: 0.62
Forest owner Marinakis banned for spitting near officials after Fulham loss: 0.6153846153846154
Son criticises fixture demands: We're not robots: 0.5961538461538461
Papers: Real Madrid 'regret' signing Mbappe: 0.5789473684210527
Wolves maintain full O'Neil support as club part ways with set-piece coach: 0.5789473684210527
Merson on Ange's trophy claims: 'I’ve got more chance of winning Strictly!': 0.5740740740740741
Everton blow another lead in draw at Leicester amid thunderstorm: 0.5714285714285714
Man City; PL both claim victory in ruling over commercial deals: 0.5714285714285714


In [130]:
"""
# adjusting the vocab file
import re

unique_words = set()

for sen in df['sentence']:
    tokens = sen.split()
    for token in tokens:
        unique_words.add(token)

with open("enhanced_vocab.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()

filtered_vocab_set = set(line.strip() for line in lines)

with open("enhanced_vocab.txt", 'a', encoding='utf-8') as f:
    for word in unique_words:
        filtered_word = re.sub(r"[.,\[\]()\"':;?/!&^@\\]", "", word.strip().lower())
        if filtered_word not in filtered_vocab_set:
            f.write(f"{filtered_word}\n") 
"""

In [12]:
# I don't think I can use a pre-trained model with a custom pre-trained tokenizer because not all the tokens the model is trained with will be present in the new vocabulary of the tokenizer

In [13]:
# Remove the topic-level helper columns from the feature frame.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second run finds the columns already gone and does nothing.
df = df.drop(columns=['argument_density', 'article_topic'], errors='ignore')

In [15]:
# Preview the first rows of the feature frame before it is saved.
df.head()

Unnamed: 0,sentence,label,best,better,big,did,going,good,just,know,...,win,NOUN,VERB,ADJ,ADV,PRON,WDT,PREP,MODAL,NUM
0,Bukayo Saka has left the England training camp...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.375,0.166667,0.041667,0.0,0.041667,0.0,0.166667,0.0,0.0
1,The Arsenal winger limped off during the secon...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.40625,0.0625,0.03125,0.03125,0.0,0.0,0.21875,0.03125,0.0
2,Bukayo would have been close but it would have...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.095238,0.285714,0.047619,0.047619,0.142857,0.0,0.095238,0.095238,0.0
3,He's a positive person and I expect him to be ...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.076923,0.230769,0.153846,0.0,0.230769,0.0,0.076923,0.0,0.0
4,Saka continues to be assessed by the Arsenal m...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.3,0.2,0.05,0.05,0.0,0.0,0.25,0.0,0.0


In [16]:
# Persist the feature frame to disk as a pickle file.
# NOTE(review): pickle files should only be loaded from trusted sources.
df.to_pickle('tokenized_data.pkl')