## Import the Libraries

In [1]:
import os
import re
import torch
import random
import difflib
import textwrap
import evaluate
import numpy as np 
import pandas as pd
import seaborn as sns 
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm import tqdm
from evaluate import load
from bert_score import score
from collections import Counter
from rouge_score import rouge_scorer
from evaluate import load as load_metric
from IPython.display import display, HTML
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, TrainingArguments, Trainer, TrainerCallback, AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback

import nltk
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yoga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Set device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


## Data Preparation

In [47]:
# Load the dataset
df = pd.read_csv("FM2023.csv")
df.index = df.index + 1
df.head()

Unnamed: 0,Name,Position,Age,ca,pa,Nationality,Club,Corners,Crossing,Dribbling,...,World reputation,Race,RCA,Colour of skin,Date of birth,Number of national team appearances,Goals scored for the national team,Salary,Rental club,UID
1,Kevin De Bruyne,M/AM RLC,31,189,189,Belgium,Manchester City,14,19,15,...,9400,Northern_European,181,4,1991/6/28,91,24,394372.0,,18004457
2,Kylian Mbappé,AM/S RL,23,188,197,France,Paris Saint-Germain,13,13,18,...,9248,African_Caribbean,172,13,1998/12/20,57,27,1035616.0,,85139014
3,Robert Lewandowski,S,33,186,190,Poland,Barcelona,3,8,13,...,9250,Northern_European,183,3,1988/8/21,132,76,345204.0,,719601
4,Erling Haaland,S,22,185,195,"Norway,England",Manchester City,7,10,14,...,8750,Northern_European,185,2,2000/7/21,21,20,394372.0,,29179241
5,Mohamed Salah,AM/S RL,30,185,187,Egypt,Liverpool,12,14,17,...,8750,North_African__Middle_Eastern,181,9,1992/6/15,85,47,405971.0,,98028755


In [48]:
# Check the shape of the data
df.shape

(8452, 98)

In [49]:
# Check all columns
df.columns

Index(['Name', 'Position', 'Age', 'ca', 'pa', 'Nationality', 'Club', 'Corners',
       'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking',
       'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing',
       'Penalty Taking', 'Tackling', 'Technique', 'Aggressiion',
       'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Vision',
       'Decision', 'Determination', 'Flair', 'Leadership', 'Off The Ball',
       'Position.1', 'Teamwork', 'Work Rate', 'Acceleration', 'Agility',
       'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace', 'Stamina',
       'Strength', 'Stability', 'Foul', 'Contest performance', 'Injury',
       'diversity', 'Aerial Reach', 'Command Of Area', 'Communication',
       'Eccentricity', 'Handling', 'Kicking', 'One On Ones', 'Reflexes',
       'Rushing Out', 'Punching', 'Throwing', 'Adaptation', 'Ambition',
       'Argue', 'Loyal', 'Resistant to stress', 'Professional',
       'Sportsmanship', 'Emotional control', 'GK', 'DL', 

### Choose the Columns

In [50]:
# Get columns that are needed
columns = [
    'Name', 'Position', 'Age', 'ca', 'pa', 'Corners',
    'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking',
    'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing',
    'Penalty Taking', 'Tackling', 'Technique', 'Aggressiion',
    'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Vision',
    'Decision', 'Determination', 'Flair', 'Leadership', 'Off The Ball',
    'Position.1', 'Teamwork', 'Work Rate', 'Acceleration', 'Agility',
    'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace', 'Stamina',
    'Strength', 'Stability', 'Foul', 'Contest performance', 'Injury',
    'diversity', 'Aerial Reach', 'Command Of Area', 'Communication',
    'Eccentricity', 'Handling', 'Kicking', 'One On Ones', 'Reflexes',
    'Rushing Out', 'Punching', 'Throwing', 'Adaptation', 'Ambition',
    'Argue', 'Loyal', 'Resistant to stress', 'Professional',
    'Sportsmanship', 'Emotional control', 'GK', 'DL', 'DC', 'DR', 'WBL',
    'WBR', 'DM', 'ML', 'MC', 'MR', 'AML', 'AMC', 'AMR', 'ST'
]

# Create the new DataFrame with only the selected columns
df = df[columns]
df.head()

Unnamed: 0,Name,Position,Age,ca,pa,Corners,Crossing,Dribbling,Finishing,First Touch,...,WBL,WBR,DM,ML,MC,MR,AML,AMC,AMR,ST
1,Kevin De Bruyne,M/AM RLC,31,189,189,14,19,15,16,16,...,1,1,12,15,20,15,14,20,14,12
2,Kylian Mbappé,AM/S RL,23,188,197,13,13,18,17,18,...,1,1,1,10,1,10,19,1,17,20
3,Robert Lewandowski,S,33,186,190,3,8,13,19,18,...,1,1,1,1,1,1,10,12,8,20
4,Erling Haaland,S,22,185,195,7,10,14,18,16,...,1,1,1,1,1,1,1,1,1,20
5,Mohamed Salah,AM/S RL,30,185,187,12,14,17,17,17,...,1,1,1,1,1,1,17,12,20,19


In [51]:
# Check the updated shape of the data
df.shape

(8452, 79)

In [52]:
# Check the updated columns
df.columns

Index(['Name', 'Position', 'Age', 'ca', 'pa', 'Corners', 'Crossing',
       'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking', 'Heading',
       'Long Shots', 'Long Throws', 'Marking', 'Passing', 'Penalty Taking',
       'Tackling', 'Technique', 'Aggressiion', 'Anticipation', 'Bravery',
       'Composure', 'Concentration', 'Vision', 'Decision', 'Determination',
       'Flair', 'Leadership', 'Off The Ball', 'Position.1', 'Teamwork',
       'Work Rate', 'Acceleration', 'Agility', 'Balance', 'Jumping Reach',
       'Natural Fitness', 'Pace', 'Stamina', 'Strength', 'Stability', 'Foul',
       'Contest performance', 'Injury', 'diversity', 'Aerial Reach',
       'Command Of Area', 'Communication', 'Eccentricity', 'Handling',
       'Kicking', 'One On Ones', 'Reflexes', 'Rushing Out', 'Punching',
       'Throwing', 'Adaptation', 'Ambition', 'Argue', 'Loyal',
       'Resistant to stress', 'Professional', 'Sportsmanship',
       'Emotional control', 'GK', 'DL', 'DC', 'DR', 'WBL', 'WBR

### Negative PA → Positive PA Approximate Mapping
| Negative PA | Range (Positive PA) | Approx. (average) |
| ----------- | ------------------- | ----------------- |
| -95         | 170–200             | **185**           |
| -90         | 160–190             | **175**           |
| -85         | 150–180             | **165**           |
| -80         | 140–170             | **155**           |
| -75         | 130–160             | **145**           |
| -70         | 120–150             | **135**           |
| -65         | 110–140             | **125**           |
| -60         | 100–130             | **115**           |
| -55         | 90–120              | **105**           |
| -50         | 80–110              | **95**            |
| -45         | 70–100              | **85**            |
| -40         | 60–90               | **75**            |
| -35         | 50–80               | **65**            |
| -30         | 40–70               | **55**            |
| -25         | 30–60               | **45**            |
| -20         | 20–50               | **35**            |
| -15         | 10–40               | **25**            |
| -10         | 1–30                | **15**            |

In [53]:
# Filter rows where 'pa' is negative
negative_pa_df = df[df['pa'] < 0]

# Show the first few rows with negative PA values
before = negative_pa_df[['Name', 'pa']].sample(10, random_state=42)

before

Unnamed: 0,Name,pa
6809,Noel López,-8
7291,Isaac Babadi,-85
7300,Nelson Palacio,-75
8247,João Conceição,-75
7946,Martin Georgiev,-75
7703,Hugo Félix,-75
8087,Zoumana Diallo,-75
7237,Fran Pérez,-8
7699,Tiago Coser,-75
7329,Harib Abdalla,-75


In [54]:
# Fix incorrectly shortened negative PA values
correction_map = {
    -1: -10, -2: -20, -3: -30, -4: -40, -5: -50,
    -6: -60, -7: -70, -8: -80, -9: -90
}

df['pa'] = df['pa'].replace(correction_map)

In [55]:
# Convert negative PA values to estimated positive equivalents
def convert_pa_to_estimated(pa):
    mapping = {
        -95: 185, -90: 175, -85: 165, -80: 155, -75: 145,
        -70: 135, -65: 125, -60: 115, -55: 105, -50: 95,
        -45: 85, -40: 75, -35: 65, -30: 55, -25: 45,
        -20: 35, -15: 25, -10: 15
    }
    return mapping.get(pa, pa)

df['pa'] = df['pa'].apply(convert_pa_to_estimated)

In [56]:
# Match by name to get updated values for the same sample and compare it
after = df[df['Name'].isin(before['Name'])][['Name', 'pa']]

comparison = before.merge(after, on='Name', suffixes=('_before', '_after'))
comparison.index = comparison.index + 1
print(comparison)

               Name  pa_before  pa_after
1        Noel López         -8       155
2      Isaac Babadi        -85       165
3    Nelson Palacio        -75       145
4    João Conceição        -75       145
5   Martin Georgiev        -75       145
6        Hugo Félix        -75       145
7    Zoumana Diallo        -75       145
8        Fran Pérez         -8       155
9       Tiago Coser        -75       145
10    Harib Abdalla        -75       145


## Attribute Tiering

### Range of Tier for Player's Age
| Age Range | Tier                     |
| --------- | ------------------------ |
| 15–18     | Youth / Wonderkid        |
| 19–21     | Young Talent             |
| 22–25     | Developing / Early Prime |
| 26–29     | Prime                    |
| 30–32     | Veteran / Experienced    |
| 33–35     | Aging                    |
| 36+       | Near Retirement          |

In [57]:
# Tier of a player based on Age
def get_tier_age(age: int) -> str:
    if pd.isnull(age):
        return "Unknown"
    elif 15 <= age <= 18:
        return "Youth/Wonderkid"
    elif 19 <= age <= 21:
        return "Young Talent"
    elif 22 <= age <= 25:
        return "Developing/Early Prime"
    elif 26 <= age <= 29:
        return "Prime"
    elif 30 <= age <= 32:
        return "Veteran/Experienced"
    elif 33 <= age <= 35:
        return "Aging"
    elif age >= 36:
        return "Near Retirement"
    else:
        return "Unknown"

In [58]:
# Select 3 sample players
sample_rows = df.iloc[[17, 10, 1]]

# Test the age tier for each player
print("Age Tier Test:\n")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    age = row["Age"]
    stage = get_tier_age(age)
    print(f"{name} (Age {age}) → {stage}")

Age Tier Test:

Rodri (Age 26) → Prime
Luka Modrić (Age 36) → Near Retirement
Kylian Mbappé (Age 23) → Developing/Early Prime


### Range of Tier for Player's Current Rating
| CA Range | Tier        |
| -------- | ----------- |
| 0–59     | Very Poor   |
| 60–89    | Poor        |
| 90–119   | Decent      |
| 120–139  | Good        |
| 140–159  | Very Good   |
| 160–179  | Excellent   |
| 180–200  | WorlClass |


In [59]:
# Tier of a player based on Current Ability (CA)
def get_tier_ca(ca: int) -> str:
    if pd.isnull(ca):
        return "Unknown"
    elif ca <= 59:
        return "Very Poor"
    elif ca <= 89:
        return "Poor"
    elif ca <= 119:
        return "Decent"
    elif ca <= 139:
        return "Good"
    elif ca <= 159:
        return "Very Good"
    elif ca <= 179:
        return "Excellent"
    elif ca <= 200:
        return "WorldClass"
    else:
        return "Unknown"

In [60]:
# Select 3 sample players
sample_rows = df.iloc[[13, 7, 2]] 

# Test the CA tier for each player
print("CA Tier Test:\n")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    ca = row["ca"]
    ca_tier = get_tier_ca(ca)
    print(f"{name} (CA {ca}) → {ca_tier}")

CA Tier Test:

Alisson (CA 177) → Excellent
Thibaut Courtois (CA 181) → WorldClass
Robert Lewandowski (CA 186) → WorldClass


### Range of Tier for Player's Potential Rating
| PA Range | Tier        |
| -------- | ----------- |
| 0–59     | Very Poor   |
| 60–89    | Poor        |
| 90–119   | Decent      |
| 120–139  | Good        |
| 140–159  | Very Good   |
| 160–179  | Excellent   |
| 180–200  | WorldClass |

In [61]:
# Tier of a player based on Potential Ability (PA)
def get_tier_pa(pa: int) -> str:
    if pd.isnull(pa):
        return "Unknown"
    elif pa <= 59:
        return "Very Poor"
    elif pa <= 89:
        return "Poor"
    elif pa <= 119:
        return "Decent"
    elif pa <= 139:
        return "Good"
    elif pa <= 159:
        return "Very Good"
    elif pa <= 179:
        return "Excellent"
    elif pa <= 200:
        return "WorldClass"
    else:
        return "Unknown"

In [62]:
# Select 3 sample players
sample_rows = df.iloc[[21, 212, 2121]] 

# Test the PA tier for each player
print("PA Tier Test:\n")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    pa = row["pa"]
    pa_tier = get_tier_pa(pa)
    print(f"{name} (PA {pa}) → {pa_tier}")

PA Tier Test:

Heung-Min Son (PA 173) → Excellent
Youri Tielemans (PA 170) → Excellent
Rodrigo Rey (PA 135) → Good


### Range of Tier for  Player's Attributes
| Attribute Value | Color Code        | Tier          |
|-----------------|-------------------|---------------|
| 1–4             | White / Light Grey| Abysmal       |
| 5–7             | Light Yellow      | Very Poor     |
| 8–10            | Yellow            | Poor          |
| 11–13           | Orange            | Decent        |
| 14–15           | Light Green       | Good          |
| 16–17           | Green             | Very Good     |
| 18–19           | Dark Green        | Excellent     |
| 20              | Darkest Green     | World-Class   |

In [63]:
# Tier of a player based on Attributes
def get_tier_attributes(value: int) -> str:
    if pd.isnull(value):
        return "Unknown"
    elif value <= 4:
        return "Abysmal"
    elif value <= 7:
        return "Very Poor"
    elif value <= 10:
        return "Poor"
    elif value <= 13:
        return "Decent"
    elif value <= 15:
        return "Good"
    elif value <= 17:
        return "Very Good"
    elif value <= 19:
        return "Excellent"
    elif value == 20:
        return "World-Class"
    else:
        return "Unknown"

In [64]:
# Select 3 sample players
sample_rows = df.iloc[[2, 12, 91]]

# Test the attributes tier for each player
attributes_to_test = ["Finishing", "Pace", "Passing", "Tackling", "Handling"]

print("Attribute Tier Test:")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    print(f"\n{name}")
    for attr in attributes_to_test:
        value = row[attr]
        tier = get_tier_attributes(value)
        print(f"  {attr}: {value} → {tier}")

Attribute Tier Test:

Robert Lewandowski
  Finishing: 19 → Excellent
  Pace: 14 → Good
  Passing: 13 → Decent
  Tackling: 7 → Very Poor
  Handling: 3 → Abysmal

Neymar
  Finishing: 16 → Very Good
  Pace: 15 → Good
  Passing: 17 → Very Good
  Tackling: 5 → Very Poor
  Handling: 2 → Abysmal

Theo Hernández
  Finishing: 13 → Decent
  Pace: 18 → Excellent
  Passing: 13 → Decent
  Tackling: 14 → Good
  Handling: 1 → Abysmal


### Top Attributes for each Position Group
| **Position Group** | **Typical Positions** | **Key Attributes**                                                                                                                                                                                                                                 | **Bonus Attributes (Optional if High)**                                                                                                             |
| ------------------ | --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Attacker**       | ST, AML, AMR, CF      | - **Finishing** (Accuracy in front of goal)  <br> - **Off The Ball** (Movement to find space)  <br> - **First Touch** (Control under pressure)  <br> - **Pace** (Speed to beat defenders)  <br> - **Composure** (Calmness when shooting)           | - **Dribbling** (Ability to beat defenders)  <br> - **Technique** (Overall ball control quality) <br> - **Anticipation** (Predicting ball movement) |
| **Midfielder**     | MC, DM, AMC           | - **Passing** (Ability to distribute accurately)  <br> - **Vision** (Spotting key passes)  <br> - **Work Rate** (Effort in both attack & defense)  <br> - **Decision** (Choosing the best action)  <br> - **Technique** (Skill in executing plays) | - **Tackling** (For defensive mids)  <br> - **Long Shots** (For attacking mids)  <br> - **Teamwork** (Coordination with teammates)                  |
| **Defender**       | DC, DL, DR, WBL, WBR  | - **Tackling** (Winning challenges cleanly)  <br> - **Marking** (Sticking to opponents)  <br> - **Positioning** (Reading the game)  <br> - **Strength** (Physical duels)  <br> - **Heading** (Aerial dominance)                                    | - **Pace** (For full-backs)  <br> - **Concentration** (Focus during play)  <br> - **Bravery** (Willingness to block or challenge)                   |
| **Goalkeeper**     | GK                    | - **Reflexes** (Quick reaction saves)  <br> - **Handling** (Secure catches)  <br> - **One On Ones** (Stopping breakaways)  <br> - **Command Of Area** (Controlling the box)  <br> - **Aerial Reach** (Claiming crosses)                            | - **Kicking** (Distribution skill)  <br> - **Composure** (Calmness under pressure)                                                                  |
   |


In [65]:
# Define key and bonus attributes per position group
POSITION_GROUPS = {
    "Attacker": {
        "positions": ['ST', 'AML', 'AMR', 'CF'],
        "key": ['Finishing', 'Off The Ball', 'First Touch', 'Pace', 'Composure'],
        "bonus": ['Dribbling', 'Technique', 'Anticipation']
    },
    "Midfielder": {
        "positions": ['MC', 'DM', 'AMC'],
        "key": ['Passing', 'Vision', 'Work Rate', 'Decision', 'Technique'],
        "bonus": ['Tackling', 'Long Shots', 'Teamwork']
    },
    "Defender": {
        "positions": ['DC', 'DL', 'DR', 'WBL', 'WBR'],
        "key": ['Tackling', 'Marking', 'Position.1', 'Strength', 'Heading'],
        "bonus": ['Pace', 'Concentration', 'Bravery']
    },
    "Goalkeeper": {
        "positions": ['GK'],
        "key": ['Reflexes', 'Handling', 'One On Ones', 'Command Of Area', 'Aerial Reach'],
        "bonus": ['Kicking', 'Composure']
    }
}

In [66]:
# Determine player's primary position group based on positional ratings
def get_player_group(row) -> str:
    for group, data in POSITION_GROUPS.items():
        for pos in data['positions']:
            if pos in row and row[pos] >= 15:
                return group
    return "Unknown"

In [67]:
# Select 3 sample players
sample_rows = df.iloc[[19, 7, 90]]

# Test the position group for each player
print("Position Group Test:\n")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    group = get_player_group(row)
    print(f"{name} → {group}")

Position Group Test:

Marc-André ter Stegen → Goalkeeper
Thibaut Courtois → Goalkeeper
João Félix → Attacker


### Scout Summary Category

| PA Range       | Age  | CA Compared to PA | CA Range | Scout Category        |
| -------------- | ---- | ----------------- | -------- | --------------------- |
| ≥ 160          | ≤ 21 | —                 | —        | **Top Talent**        |
| ≥ 160          | ≤ 26 | CA < PA           | —        | **High Potential**    |
| ≥ 160          | > 26 | CA ≥ PA           | —        | **Established Star**  |
| ≥ 140 and <160 | ≤ 26 | CA < PA           | —        | **Promising Player**  |
| ≥ 140 and <160 | —    | CA ≥ 140          | —        | **Key Player**        |
| ≥ 140 and <160 | —    | else              | —        | **Reliable Starter**  |
| <140           | —    | —                 | ≥ 120    | **Squad Contributor** |
| <140           | —    | —                 | ≥ 90     | **Depth Option**      |
| <140           | —    | —                 | < 90     | **Limited Role**      |


In [68]:
# Define a scout category based on CA, PA, and Age
def get_scout_summary(ca: int, pa: int, age: int) -> str:
    if pd.isnull(ca) or pd.isnull(pa) or pd.isnull(age):
        return "Uncategorized"

    # Young elite talent
    if pa >= 160:
        if age <= 21:
            return "Top Talent"
        elif ca < pa and age <= 26:
            return "High Potential"
        else:
            return "Established Star"

    # Good PA players
    elif pa >= 140:
        if ca < pa and age <= 26:
            return "Promising Player"
        elif ca >= 140:
            return "Key Player"
        else:
            return "Reliable Starter"

    # Average professionals
    elif ca >= 120:
        return "Squad Contributor"
    elif ca >= 90:
        return "Depth Option"

    # Older or weaker players
    return "Limited Role"

In [69]:
# Select 3 sample players
sample_rows = df.iloc[[1701, 1710, 1017]]

# Test the scout category for each player
print("Scout Summary Category Test:\n")
for idx, row in sample_rows.iterrows():
    name = row["Name"]
    ca = row["ca"]
    pa = row["pa"]
    age = row["Age"]
    summary = get_scout_summary(ca, pa, age)
    print(f"{name} (CA: {ca}, PA: {pa}, Age: {age}) → {summary}")

Scout Summary Category Test:

Guus Til (CA: 131, PA: 138, Age: 24) → Squad Contributor
Juan Pablo Vargas (CA: 131, PA: 135, Age: 27) → Squad Contributor
Marouane Fellaini (CA: 136, PA: 150, Age: 34) → Reliable Starter


## Scouting Report Template Generation

### Prepare Input for Fine-tuning a T5-style Model

In [70]:
# Turn a player's features into a structured input string for training
def make_input_features(row) -> str:
    fields = [f"{col}: {row[col]}" for col in row.index if pd.notnull(row[col])]
    return "; ".join(fields)

### Template for the Scout Report

In [71]:
# Define template for the scout report
def generate_scouting_report(row) -> str:
    name = row.get("Name", "Unknown Player")
    age = row.get("Age")
    ca = row.get("ca")
    pa = row.get("pa")
    original_position = row.get("Position", "Unknown Position")
    age_stage = get_tier_age(age)
    ca_tier = get_tier_ca(ca)
    pa_tier = get_tier_pa(pa)
    scout_category = get_scout_summary(ca, pa, age)

    # Use to select correct attribute set
    position_group = get_player_group(row)

    if position_group not in POSITION_GROUPS:
        return f"{name} has no clearly defined role, making scouting difficult."

    key_attrs = POSITION_GROUPS[position_group]["key"]
    bonus_attrs = POSITION_GROUPS[position_group]["bonus"]

    # Describe key strengths
    key_descriptions = []
    for attr in key_attrs:
        value = row.get(attr)
        tier = get_tier_attributes(value)
        if tier != "Unknown":
            key_descriptions.append(f"{attr.lower()} rated {tier.lower()}")

    # Describe bonus skills (only if high enough)
    bonus_descriptions = []
    for attr in bonus_attrs:
        value = row.get(attr)
        tier = get_tier_attributes(value)
        if tier in ["Very Good", "Excellent", "World-Class"]:
            bonus_descriptions.append(f"{attr.lower()} ({tier.lower()})")

    # Development potential line
    if ca < pa and age <= 26:
        potential_line = "He still has room to grow and could reach a higher level with the right development."
    elif ca < pa and age > 26:
        potential_line = "While there's some room between his current and potential ability, his age suggests limited upside."
    elif ca == pa:
        potential_line = "He appears to be playing close to his full potential."
    else:
        potential_line = "His current ability may suggest he's past his peak."

    # Define multiple templates
    templates = [
        f"{name} is a {age_stage.lower()} player that plays as {position_group.lower()} in the position {original_position} who currently shows {ca_tier.lower()} ability (CA: {ca}), with a potential ceiling of {pa_tier.lower()} (PA: {pa}).",
        f"Playing as a {position_group.lower()} in the position {original_position}, {name} is a {age_stage.lower()} player showing {ca_tier.lower()} quality (CA: {ca}) and could reach {pa_tier.lower()} (PA: {pa}).",
        f"{name}, aged {age}, features as a {position_group.lower()} ({original_position}). Currently rated {ca_tier.lower()} (CA: {ca}), he may develop into a {pa_tier.lower()} (PA: {pa}) level player."]
    
    intro = random.choice(templates)

    # Attribute summary
    if key_descriptions:
        attributes_summary = f" Key strengths include: " + ", ".join(key_descriptions) + "."
    else:
        attributes_summary = ""

    if bonus_descriptions:
        bonus_summary = f" He also shows strong ability in areas like " + ", ".join(bonus_descriptions) + "."
    else:
        bonus_summary = ""

    # Final line
    summary_line = f" Overall, he can be categorized as a {scout_category.lower()}."

    # Combine all parts
    report = intro + " " + potential_line + attributes_summary + bonus_summary + summary_line

    return report.strip()

### Utlize a T5-based paraphrasing model

In [72]:
# Load the paraphrasing model
paraphrase_model_name = "ramsrigouthamg/t5_paraphraser"
paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_name).to(device)

In [73]:
# Define the paraphrasing function
def paraphrase_text(text, num_return_sequences=1, num_beams=4, max_length=256):
    input_text = "paraphrase: " + text + " </s>"

    encoding = paraphrase_tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = paraphrase_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            early_stopping=True
        )

    return paraphrase_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [74]:
# Define the batch paraphrasing function
def batch_paraphrase_texts(texts, batch_size=16):
    outputs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Paraphrasing (GPU)"):
        batch = texts[i:i+batch_size]
        outputs.extend([paraphrase_text(t) for t in batch])
    return outputs

### Make New Dataset for Fine-tuning a T5-style Model

In [75]:
# Create the original input text and generated report
df["Input Text"] = df.apply(make_input_features, axis=1)
df["Generated Report"] = df.apply(generate_scouting_report, axis=1)

# Sample 25% of rows to paraphrase
df_aug = df.sample(frac=0.25, random_state=42).copy()
tqdm.pandas()
df_aug["Generated Report"] = batch_paraphrase_texts(df_aug["Generated Report"].tolist(), batch_size=8)

# Combine paraphrased with original data for more diversity
df_final = pd.concat([df, df_aug], ignore_index=True)

Paraphrasing (GPU): 100%|██████████████████████████████| 265/265 [9:32:14<00:00, 129.56s/it]


In [76]:
df_final[["Input Text", "Generated Report"]].to_csv("scouting_finetune_dataset3.csv", index=False)

In [82]:
# Load and check the new dataset
df = pd.read_csv("scouting_finetune_dataset3.csv")

sample_df = df_final.sample(n=3).reset_index(drop=True)
sample_df.index = sample_df.index + 1

display(HTML(sample_df[["Input Text", "Generated Report"]].to_html(escape=False)))

Unnamed: 0,Input Text,Generated Report
1,Name: Andreaw Gravillon; Position: D C; Age: 24; ca: 120; pa: 139; Corners: 6; Crossing: 6; Dribbling: 6; Finishing: 6; First Touch: 11; Free Kick Taking: 5; Heading: 14; Long Shots: 9; Long Throws: 5; Marking: 14; Passing: 10; Penalty Taking: 10; Tackling: 13; Technique: 11; Aggressiion: 13; Anticipation: 12; Bravery: 14; Composure: 11; Concentration: 12; Vision: 7; Decision: 13; Determination: 13; Flair: 6; Leadership: 10; Off The Ball: 6; Position.1: 13; Teamwork: 12; Work Rate: 12; Acceleration: 13; Agility: 13; Balance: 14; Jumping Reach: 13; Natural Fitness: 16; Pace: 13; Stamina: 15; Strength: 14; Stability: 13; Foul: 7; Contest performance: 12; Injury: 2; diversity: 10; Aerial Reach: 2; Command Of Area: 2; Communication: 2; Eccentricity: 1; Handling: 1; Kicking: 2; One On Ones: 3; Reflexes: 2; Rushing Out: 1; Punching: 4; Throwing: 2; Adaptation: 11; Ambition: 12; Argue: 10; Loyal: 10; Resistant to stress: 11; Professional: 12; Sportsmanship: 12; Emotional control: 8; GK: 1; DL: 1; DC: 20; DR: 10; WBL: 1; WBR: 1; DM: 1; ML: 1; MC: 1; MR: 1; AML: 1; AMC: 1; AMR: 1; ST: 1,"Andreaw Gravillon is a developing/early prime player that plays as defender in the position D C who currently shows good ability (CA: 120), with a potential ceiling of good (PA: 139). He still has room to grow and could reach a higher level with the right development. Key strengths include: tackling rated decent, marking rated good, position.1 rated decent, strength rated good, heading rated good. Overall, he can be categorized as a squad contributor."
2,Name: Rafael Gava; Position: M/AM C; Age: 29; ca: 120; pa: 124; Corners: 11; Crossing: 13; Dribbling: 13; Finishing: 12; First Touch: 14; Free Kick Taking: 8; Heading: 7; Long Shots: 10; Long Throws: 5; Marking: 6; Passing: 13; Penalty Taking: 13; Tackling: 7; Technique: 13; Aggressiion: 14; Anticipation: 13; Bravery: 9; Composure: 13; Concentration: 12; Vision: 12; Decision: 12; Determination: 14; Flair: 10; Leadership: 13; Off The Ball: 13; Position.1: 7; Teamwork: 14; Work Rate: 14; Acceleration: 13; Agility: 13; Balance: 11; Jumping Reach: 9; Natural Fitness: 12; Pace: 13; Stamina: 13; Strength: 8; Stability: 13; Foul: 11; Contest performance: 11; Injury: 5; diversity: 14; Aerial Reach: 3; Command Of Area: 3; Communication: 2; Eccentricity: 3; Handling: 3; Kicking: 1; One On Ones: 1; Reflexes: 1; Rushing Out: 3; Punching: 3; Throwing: 1; Adaptation: 12; Ambition: 14; Argue: 10; Loyal: 10; Resistant to stress: 11; Professional: 13; Sportsmanship: 13; Emotional control: 10; GK: 1; DL: 10; DC: 1; DR: 1; WBL: 10; WBR: 1; DM: 1; ML: 12; MC: 18; MR: 1; AML: 12; AMC: 20; AMR: 12; ST: 1,"Playing as a midfielder in the position M/AM C, Rafael Gava is a prime player showing good quality (CA: 120) and could reach good (PA: 124). While there's some room between his current and potential ability, his age suggests limited upside. Key strengths include: passing rated decent, vision rated decent, work rate rated good, decision rated decent, technique rated decent. Overall, he can be categorized as a squad contributor."
3,Name: Ulrik Saltnes; Position: M/AM C; Age: 29; ca: 126; pa: 129; Corners: 8; Crossing: 9; Dribbling: 10; Finishing: 11; First Touch: 10; Free Kick Taking: 12; Heading: 11; Long Shots: 12; Long Throws: 6; Marking: 12; Passing: 12; Penalty Taking: 15; Tackling: 12; Technique: 12; Aggressiion: 13; Anticipation: 15; Bravery: 13; Composure: 13; Concentration: 15; Vision: 12; Decision: 14; Determination: 18; Flair: 10; Leadership: 15; Off The Ball: 14; Position.1: 13; Teamwork: 16; Work Rate: 16; Acceleration: 10; Agility: 11; Balance: 13; Jumping Reach: 13; Natural Fitness: 16; Pace: 10; Stamina: 14; Strength: 13; Stability: 16; Foul: 12; Contest performance: 11; Injury: 5; diversity: 16; Aerial Reach: 1; Command Of Area: 3; Communication: 1; Eccentricity: 4; Handling: 2; Kicking: 1; One On Ones: 1; Reflexes: 1; Rushing Out: 2; Punching: 1; Throwing: 2; Adaptation: 7; Ambition: 17; Argue: 7; Loyal: 17; Resistant to stress: 11; Professional: 17; Sportsmanship: 7; Emotional control: 8; GK: 1; DL: 1; DC: 15; DR: 1; WBL: 1; WBR: 1; DM: 15; ML: 1; MC: 20; MR: 1; AML: 1; AMC: 16; AMR: 1; ST: 11,"Ulrik Saltnes, aged 29, features as a midfielder (M/AM C). Currently rated good (CA: 126), he may develop into a good (PA: 129) level player. While there's some room between his current and potential ability, his age suggests limited upside. Key strengths include: passing rated decent, vision rated decent, work rate rated very good, decision rated good, technique rated decent. He also shows strong ability in areas like teamwork (very good). Overall, he can be categorized as a squad contributor."


## Model Fine-Tuning 

In [13]:
# Load and split the dataset
df = pd.read_csv("scouting_finetune_dataset3.csv")
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

Train size: 8452, Validation size: 2113


In [14]:
# Load tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Preprocessing function
def preprocess(example):
    input_text = "generate report: " + example["Input Text"]
    target_text = example["Generated Report"]
    tokenized = tokenizer(
        input_text, max_length=512, truncation=True, padding="max_length"
    )
    tokenized["labels"] = tokenizer(
        target_text, max_length=256, truncation=True, padding="max_length"
    )["input_ids"]
    return tokenized

In [16]:
# Tokenize dataset
tokenized_train = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess, remove_columns=val_dataset.column_names)

Map: 100%|█████████████████████████████████████████████████████████████████| 8452/8452 [01:16<00:00, 110.07 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 2113/2113 [00:16<00:00, 127.13 examples/s]


In [17]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5-scouting-model",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2, 
    num_train_epochs=10, 
    logging_steps=20,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",  
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    warmup_steps=200,
    weight_decay=0.01,
    report_to="tensorboard", 
    fp16=True  
)



In [18]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [22]:
# Save & train the model
train_output = trainer.train()
print(train_output)

Step,Training Loss,Validation Loss
100,3.8314,2.146342
200,1.1536,0.778524
300,0.4625,0.278002
400,0.296,0.168988
500,0.2222,0.134116
600,0.1865,0.108673
700,0.1539,0.096845
800,0.145,0.090375
900,0.1294,0.084942
1000,0.1173,0.081436


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5280, training_loss=0.27471077241229286, metrics={'train_runtime': 8117.5756, 'train_samples_per_second': 10.412, 'train_steps_per_second': 0.65, 'total_flos': 1.1428803083501568e+16, 'train_loss': 0.27471077241229286, 'epoch': 9.990539262062441})


In [23]:
# Extract training metrics
metrics = train_output.metrics
final_loss = train_output.training_loss
steps = train_output.global_step
epochs = metrics.get("epoch", 0)
runtime = metrics.get("train_runtime", 0)
samples_per_sec = metrics.get("train_samples_per_second", 0)
steps_per_sec = metrics.get("train_steps_per_second", 0)
best_eval_loss = metrics.get("eval_loss")
best_model_path = trainer.state.best_model_checkpoint

print("Fine-Tuning Summary")
print(f"Training stopped after {steps} steps (≈{epochs:.2f} epochs).")
print(f"Final training loss: {final_loss:.4f}")
if best_eval_loss is not None:
    print(f"Best evaluation loss: {best_eval_loss:.4f}")
print(f"Total training time: {runtime:.1f} seconds (~{runtime/60:.1f} minutes)")
print(f"Training throughput: {samples_per_sec:.2f} samples/sec, {steps_per_sec:.2f} steps/sec)")
if best_model_path:
    print(f"Best model was saved at: {best_model_path}")

Fine-Tuning Summary
Training stopped after 5280 steps (≈9.99 epochs).
Final training loss: 0.2747
Total training time: 8117.6 seconds (~135.3 minutes)
Training throughput: 10.41 samples/sec, 0.65 steps/sec)
Best model was saved at: ./t5-scouting-model\checkpoint-5200


In [24]:
# Save the Fine-tuned model
trainer.save_model("t5-scouting-model")
tokenizer.save_pretrained("t5-scouting-model")

('t5-scouting-model\\tokenizer_config.json',
 't5-scouting-model\\special_tokens_map.json',
 't5-scouting-model\\spiece.model',
 't5-scouting-model\\added_tokens.json')

## Model Evaluation

In [43]:
# Load the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("t5-scouting-model").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-scouting-model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Load the dataset
df = pd.read_csv("scouting_finetune_dataset3.csv")
val_df = df.sample(frac=0.1, random_state=42)

# Prepare evaluation datasets
references = []
predictions = []

In [40]:
# Compute ROUGE and BLEU
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Evaluation loop
for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Evaluating"):
    input_text = "generate report: " + row["Input Text"]
    ref_text = row["Generated Report"]

    # Tokenize and generate
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=256)
        pred_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    predictions.append(pred_text)
    references.append(ref_text)

rouge_result = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])

Downloading builder script: 100%|█████████████████████████████████████████████████| 5.94k/5.94k [00:00<00:00, 3.41MB/s]
Downloading extra modules: 4.07kB [00:00, ?B/s]                                                                        
Downloading extra modules: 100%|██████████████████████████████████████████████████████████| 3.34k/3.34k [00:00<?, ?B/s]
Evaluating: 100%|████████████████████████████████████████████████████████████████| 1056/1056 [2:17:37<00:00,  7.82s/it]


In [42]:
# Show evaluation results
print("\nEvaluation Results:")
print("ROUGE-1:", f"{rouge_result['rouge1']:.2f}")
print("ROUGE-2:", f"{rouge_result['rouge2']:.2f}")
print("ROUGE-L:", f"{rouge_result['rougeL']:.2f}")
print("BLEU:", f"{bleu_result['bleu']:.2f}")


Evaluation Results:
ROUGE-1: 0.85
ROUGE-2: 0.74
ROUGE-L: 0.80
BLEU: 0.76


### Model Testing

In [3]:
# Load the dataset & fine-tuned model
df = pd.read_csv("scouting_finetune_dataset3.csv")

model = T5ForConditionalGeneration.from_pretrained("t5-scouting-model").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-scouting-model")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Randomly sample 1 row
samples = df.sample(n=1)

# Collect data
results = []

# Evaluation loop
for _, row in samples.iterrows():
    input_text = row["Input Text"]
    original_report = row["Generated Report"]
    
    # Tokenize and generate
    inputs = tokenizer("generate report: " + input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=256)
        generated_report = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Append to result
    results.append({
        "Input Text": textwrap.shorten(input_text, width=300, placeholder=" ..."),
        "Generated Report": generated_report
    })

# Visualize the results
comparison_df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(comparison_df)

Unnamed: 0,Input Text,Generated Report
0,Name: Rodrigo Márquez; Position: AM RL; Age: 20; ca: 115; pa: 145; Corners: 10; Crossing: 11; Dribbling: 13; Finishing: 12; First Touch: 14; Free Kick Taking: 12; Heading: 12; Long Shots: 14; Long Throws: 10; Marking: 8; Passing: 13; Penalty Taking: 13; Tackling: 10; Technique: 11; Aggressiion: ...,"Rodrigo Márquez is a young talent player that plays as attacker in the position AM RL who currently shows decent ability (CA: 115), with a potential ceiling of very good (PA: 145). He still has room to grow and could reach a higher level with the right development. Key strengths include: finishing rated decent, off the ball rated decent, first touch rated good, pace rated decent, composure rated good. Overall, he can be categorized as a promising player."


In [15]:
# Randomly sample 1 row
samples = df.sample(n=1)

# Collect data
results = []

# Evaluation loop
for _, row in samples.iterrows():
    input_text = row["Input Text"]
    original_report = row["Generated Report"]
    
    # Tokenize and generate
    inputs = tokenizer("generate report: " + input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=256)
        generated_report = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Append to result
    results.append({
        "Input Text": textwrap.shorten(input_text, width=300, placeholder=" ..."),
        "Generated Report": generated_report
    })

# Visualize the results
comparison_df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(comparison_df)

Unnamed: 0,Input Text,Generated Report
0,Name: Youcef Belaïli; Position: M/AM L; Age: 30; ca: 136; pa: 145; Corners: 11; Crossing: 13; Dribbling: 15; Finishing: 13; First Touch: 15; Free Kick Taking: 12; Heading: 8; Long Shots: 14; Long Throws: 4; Marking: 7; Passing: 14; Penalty Taking: 12; Tackling: 7; Technique: 15; Aggressiion: 13; ...,"Playing as a attacker in the position M/AM L, Youcef Belali is a veteran/experienced player showing good quality (CA: 136) and could reach good (PA: 145). He still has room to grow and could reach a higher level with the right development. Key strengths include: finishing rated decent, off the ball rated decent, first touch rated good, pace rated good, composure rated decent. Overall, he can be categorized as a squad contributor."


In [34]:
# Randomly sample 1 row
samples = df.sample(n=1)

# Collect data
results = []

# Evaluation loop
for _, row in samples.iterrows():
    input_text = row["Input Text"]
    original_report = row["Generated Report"]
    
    # Tokenize and generate
    inputs = tokenizer("generate report: " + input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=256)
        generated_report = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Append to result
    results.append({
        "Input Text": textwrap.shorten(input_text, width=300, placeholder=" ..."),
        "Generated Report": generated_report
    })

# Visualize the results
comparison_df = pd.DataFrame(results)
pd.set_option('display.max_colwidth', None)
display(comparison_df)

Unnamed: 0,Input Text,Generated Report
0,Name: Carlos Alcaraz; Position: M/AM C; Age: 19; ca: 120; pa: 155; Corners: 12; Crossing: 11; Dribbling: 13; Finishing: 13; First Touch: 14; Free Kick Taking: 12; Heading: 7; Long Shots: 12; Long Throws: 9; Marking: 10; Passing: 14; Penalty Taking: 7; Tackling: 9; Technique: 14; Aggressiion: 6; ...,"Carlos Alcaraz, aged 19, features as a attacker (M/AM C). Currently rated good (CA: 120), he may develop into a good (PA: 155) level player. While there's some room between his current and potential ability, his age suggests limited upside. Key strengths include: finishing rated decent, off the ball rated good, first touch rated good, pace rated decent, composure rated poor. Overall, he can be categorized as a squad contributor."
