In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## Load and process dataset

In [2]:
import pandas as pd
import json
import os

# --- Configuration ---
# Base URL of the MIPD data folder on GitHub, and the Drive folder that receives the JSONL exports.
REPO_BASE = "https://raw.githubusercontent.com/ArkadiusDS/MIPD/master/data"
OUTPUT_DIR = "/content/drive/MyDrive"

# CSV split name on GitHub -> suffix used in the local JSONL file name.
# Example: "validation" is read from 'validation.csv' and written to 'mipd_val.jsonl'.
files_map = dict(
    train="train",
    validation="val",
    test="test",
)

# Technique label columns; left empty here and filled in by the processing loop
# the first time a split is loaded successfully.
TECHNIQUES_COLUMNS = list()

def format_record(row, techniques_cols):
    """Convert one DataFrame row into a training record.

    Args:
        row: Mapping-like row (for example a pandas Series) that exposes every
            column named in ``techniques_cols`` plus an 'article' field.
        techniques_cols: Names of the technique label columns to inspect.

    Returns:
        dict with two keys: 'input' (the article text) and 'output' (a Markdown
        fenced ``json`` block whose 'discovered_techniques' list names the
        active technique columns, in the order given by ``techniques_cols``).
    """
    # 1. Collect the active techniques (label value is True/1).
    #    Missing labels are skipped explicitly: NaN is truthy in Python, so a bare
    #    truthiness test would report an unlabeled technique as present, and
    #    pd.NA cannot be used in a boolean context at all.
    active_techniques = [
        col for col in techniques_cols
        if not pd.isna(row[col]) and row[col]
    ]

    # 2. Build the target (output): a JSON document wrapped in a Markdown code fence.
    json_content = json.dumps({"discovered_techniques": active_techniques}, ensure_ascii=False)
    output_str = f"```json\n{json_content}\n```"

    # 3. Return the input/output dictionary used for fine-tuning with Unsloth.
    return {
        "input": row['article'],
        "output": output_str
    }

# --- Main Processing Loop ---

# Formatted frame of every split, keyed by output suffix ("train", "val", "test").
# `formatted_df` is rebound on each iteration, so once the loop ends it only holds the
# last split processed; look a split up here when a specific one is needed.
formatted_splits = {}

for input_name, output_suffix in files_map.items():

    # 1. Construct URL
    url = f"{REPO_BASE}/manipulation/{input_name}.csv"
    print(f"\n--- Processing: {input_name} ---")
    print(f"Downloading data from: {url}")

    try:
        # 2. Load Data
        df = pd.read_csv(url)
        print(f"Successfully loaded {len(df)} records.")

        # 3. Detect Columns (only once, on the first split that loads).
        # The technique labels are the contiguous run of columns from
        # 'REFERENCE_ERROR' through 'QUOTE_MINING' inclusive; the other splits
        # are assumed to share the same structure.
        if not TECHNIQUES_COLUMNS:
            try:
                start_col = df.columns.get_loc('REFERENCE_ERROR')
                end_col = df.columns.get_loc('QUOTE_MINING') + 1
                TECHNIQUES_COLUMNS = df.columns[start_col:end_col].tolist()
                print(f"Defined manipulation techniques columns ({len(TECHNIQUES_COLUMNS)}): {TECHNIQUES_COLUMNS}")
            except KeyError as e:
                print(f"Critical Error: Could not find technique columns in {input_name}. Missing: {e}")
                continue

        # 4. Apply Formatting
        # The lambda passes the detected column list to format_record for every row.
        print("Formatting records...")
        formatted_df = df.apply(
            lambda row: format_record(row, TECHNIQUES_COLUMNS),
            axis=1,
            result_type='expand'
        )
        formatted_splits[output_suffix] = formatted_df

        # 5. Save to JSONL
        output_file = os.path.join(OUTPUT_DIR, f"mipd_{output_suffix}.jsonl")
        formatted_df.to_json(output_file, orient='records', lines=True, force_ascii=False)

        print(f"Saved {len(formatted_df)} records to: {output_file}")
        # Guard the preview: indexing row 0 of an empty frame raises IndexError, which
        # would be reported below as a processing error although the file was written.
        if not formatted_df.empty:
            print("Sample record:", formatted_df.iloc[0].to_dict())

    except Exception as e:
        # Best-effort: report the failure and carry on with the next split.
        print(f"Error processing {input_name}: {e}")


--- Processing: train ---
Downloading data from: https://raw.githubusercontent.com/ArkadiusDS/MIPD/master/data/manipulation/train.csv
Successfully loaded 10749 records.
Defined manipulation techniques columns (11): ['REFERENCE_ERROR', 'WHATABOUTISM', 'STRAWMAN', 'EMOTIONAL_CONTENT', 'CHERRY_PICKING', 'FALSE_CAUSE', 'MISLEADING_CLICKBAI', 'ANECDOTE', 'LEADING_QUESTIONS', 'EXAGGERATION', 'QUOTE_MINING']
Formatting records...
Saved 10749 records to: /content/drive/MyDrive/mipd_train.jsonl
Sample record: {'input': '"Terapia homoseksualizmu ‚Äì szansa czy oszustwo? - Strona ≈ªycia" "Osoby te sƒÖ przypuszczalnie sterowane przez ≈õrodowiska gejowskie, ale te≈º przez dziennikarzy, kt√≥rzy szukajƒÖ gorƒÖcego medialnego tematu. Np. redaktor Jan J√≥zefowicz z telewizji WTK jako gej szukajƒÖcy pomocy przyszed≈Ç do poradni z ukrytym nadajnikiem, kt√≥ry obs≈Çugiwa≈Ça bƒôdƒÖca w pobli≈ºu jego redakcyjna kole≈ºanka. Jednocze≈õnie jest przecie≈º spora grupa os√≥b, kt√≥re naprawdƒô szukajƒÖ pomocy w pr

## Inspect processed records

In [3]:
import pandas as pd

# Lift pandas' display truncation for both long cell contents and long frames.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# NOTE: positional index 1 is the SECOND row of `formatted_df`, despite the variable name.
first_record = formatted_df.iloc[1].to_dict()
print("Input:", first_record['input'], sep="\n")
print("Output:", first_record['output'], sep="\n")

Input:
"Migranci porwali statek i p≈Çynƒôli w stronƒô Malty. Interweniowa≈Çy si≈Çy zbrojne" "Szef w≈Çoskiego MSW Matteo Salvini przekaza≈Ç na Twitterze, ≈ºe u wybrze≈ºy Libii zosta≈Ç porwany turecki tankowiec El Hiblu 1. Kontrolƒô nad nim przejƒô≈Ço 108 imigrant√≥w, w tym 77 mƒô≈ºczyzn i 31 kobiet, kt√≥rzy chcieli, by okrƒôt obra≈Ç kurs na w≈ÇoskƒÖ wyspƒô Lampedusa i Maltƒô. Zgodnie z planem, statek mia≈Ç kierowaƒá siƒô w stronƒô Libii. RzƒÖdy obu kraj√≥w poczƒÖtkowo zadeklarowa≈Çy, ≈ºe bƒôdƒÖ trzymaƒá statek z dala od w√≥d terytorialnych Morza ≈ör√≥dziemnego. Agencja AFP przekaza≈Ça jednak, ≈ºe malta≈Ñskie si≈Çy zbrojne eskortowa≈Çy statek w kierunku swojego portu. Rzecznik armii przekazywa≈Ç w rozmowie z ‚ÄûMalta Today‚Äù, ≈ºe pr√≥by nawiƒÖzania kontaktu z kapitanem tankowca ko≈Ñczy≈Çy siƒô fiaskiem. Najnowsze informacje Najnowszych doniesie≈Ñ w sprawie dostarczy≈Ç ‚ÄûThe Telegraph‚Äù. Z informacji przekazanych przez korespondenta z Rzymu wynika, ≈ºe malta≈Ñskie si≈Çy zbrojne wesz≈Çy

In [5]:
import json

def has_empty_techniques(output_str):
    """Tell whether a formatted target names no techniques.

    The Markdown fence markers are stripped from ``output_str`` and the remainder
    is parsed as JSON. Returns True when 'discovered_techniques' is empty or
    absent, and also when the value cannot be processed at all (not a string,
    invalid JSON, or JSON that is not an object); returns False otherwise.
    """
    try:
        # Strip the opening and closing fence markers, leaving the bare JSON text.
        payload = output_str.replace('```json\n', '')
        payload = payload.replace('\n```', '')
        techniques = json.loads(payload).get('discovered_techniques')
    except (json.JSONDecodeError, AttributeError):
        # Malformed or missing targets count as empty.
        return True

    if techniques:
        return False
    return True

# Drop every record whose target lists no techniques.
original_count = len(formatted_df)

# True for rows whose 'discovered_techniques' list is empty (or whose target cannot be parsed).
empty_techniques_mask = formatted_df['output'].apply(has_empty_techniques)

# Keep the complement of the mask; the copy makes the result independent of formatted_df.
filtered_empty_labels_df = formatted_df.loc[~empty_techniques_mask].copy()
filtered_count = len(filtered_empty_labels_df)

print(
    f"Original records in formatted_df: {original_count}",
    f"Records after filtering out empty labels: {filtered_count}",
    f"Number of records removed: {original_count - filtered_count}",
    sep="\n",
)

if filtered_empty_labels_df.empty:
    print("\nNo records found with non-empty labels after filtering.")
else:
    print("\nSample record from filtered dataset (with non-empty labels):")
    sample_record = filtered_empty_labels_df.iloc[0].to_dict()
    print(f"Input:\n{sample_record['input']}")
    print(f"Output:\n{sample_record['output']}")

Original records in formatted_df: 1521
Records after filtering out empty labels: 373
Number of records removed: 1148

Sample record from filtered dataset (with non-empty labels):
Input:
"Minister Ziobro: Polska musi odrzuciƒá unijny pakiet klimatyczny!" "03.01.22, 08:55Fot. P. Tracz via Flickr, (KPRM), CC 0 Minister Ziobro: Polska musi odrzuciƒá unijny pakiet klimatyczny! ‚ÄûPolska musi odrzuciƒá unijny pakiet klimatyczny, wtedy rachunki za prƒÖd spadnƒÖ o 60%‚Äù - napisa≈Ç minister sprawiedliwo≈õci Zbigniew Ziobro na Twitterze. Szef resortu sprawiedliwo≈õci podkre≈õli≈Ç, ≈ºe za ‚Äûszale≈ÑczƒÖ politykƒô klimatycznƒÖ UE odpowiada Europejska Partia Ludowa. Dalej przypomnia≈Ç, ≈ºe na jej czele stoi Donald Tusk. Na koniec doda≈Ç: ‚ÄûSolidarna Polska jest za racjonalnƒÖ ochronƒÖ klimatu. Pakietu ruiny nie chcemy!‚Äù dam/twitter,Fronda.pl Dlaczego moje dzieci i wnuki majƒÖ ≈ºyƒá w syfie? Bo tak siƒô Panu podoba? Miliardy na propagandƒô to macie.Pomys≈Ç ze zniesieniem op≈Çat emisyjnych dla Eu

## Filter out records longer than 16k tokens from the training dataset (GPU needed)

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture keeps the installer output out of the notebook.
import os, re
# Outside Colab (no environment variable name contains "COLAB_") a plain install is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the leading "major.minor" part of the installed torch version; it selects the xformers pin below.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs the listed packages without resolving their own dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions installed in both branches.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel
from google.colab import userdata
import torch

# Context window requested from the model, in tokens.
max_seq_length = 16384
# Absolute path, consistent with the /content/drive mount point and the other Drive
# paths in this notebook; the previous relative "drive/MyDrive/..." form only resolved
# while the working directory happened to be /content.
base_model_dir = "/content/drive/MyDrive/bielik-4.5b-base"

# Load the base model and its tokenizer from the local directory on Drive.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_dir,
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    use_gradient_checkpointing = "unsloth",
)

NotImplementedError: Unsloth cannot find any torch accelerator? You need a GPU.

In [None]:
def get_token_length(text):
    """Return the number of tokens the loaded tokenizer produces for ``text``.

    Anything that is not a ``str`` (for example a missing value) counts as 0 tokens.
    """
    return len(tokenizer.encode(text)) if isinstance(text, str) else 0

# Token budget for the prompt; the filtering cell keeps records strictly below it.
# NOTE(review): 16128 = 16384 - 256, so this is presumably max_seq_length minus
# 256 tokens of headroom for the generated answer — confirm.
TOKEN_LIMIT = 16128

# This only announces the limit; no filtering is performed in this cell.
print(f"Filtering datasets to ensure combined 'instruction' + 'input' token length is less than {TOKEN_LIMIT} tokens, using the loaded tokenizer.")

In [None]:
import pandas as pd

# Apply filtering to the TRAINING split.
# `formatted_df` is rebound for every split by the processing loop and ends up holding
# the last one, so the training records are re-read from the JSONL file that loop saved.
train_file = "/content/drive/MyDrive/mipd_train.jsonl"
train_formatted_df = pd.read_json(train_file, lines=True, dtype=False)

# Count tokens only over the prompt columns that actually exist: the records written by
# format_record carry 'input' and 'output' but no 'instruction' column.
prompt_columns = [col for col in ('instruction', 'input') if col in train_formatted_df.columns]
if not prompt_columns:
    raise KeyError(f"Expected an 'instruction' and/or 'input' column in {train_file}")

# Kept as a separate Series so the loaded frame is not mutated and no temporary
# column has to be dropped afterwards.
combined_token_length = sum(
    train_formatted_df[col].map(get_token_length) for col in prompt_columns
)
filtered_formatted_df = train_formatted_df[combined_token_length < TOKEN_LIMIT].copy()
print(f"Original training records: {len(train_formatted_df)}")
print(f"Filtered training records (token length < {TOKEN_LIMIT}): {len(filtered_formatted_df)}")

# Save the filtered training dataset to a new JSONL file
output_file_16k = "/content/drive/MyDrive/mipd_train_16k.jsonl"
filtered_formatted_df.to_json(output_file_16k, orient='records', lines=True, force_ascii=False)

print(f"Zapisano {len(filtered_formatted_df)} rekordów do pliku: {output_file_16k}")
# Only preview a record when at least one survived the filter (iloc[0] raises on an empty frame).
if not filtered_formatted_df.empty:
    print("Przykładowy rekord z przefiltrowanego zbioru:", filtered_formatted_df.iloc[0].to_dict())

## Create a mini training set for MLOps testing

In [None]:
import pandas as pd
import os

# Define paths
OUTPUT_DIR = "/content/drive/MyDrive"
input_file = os.path.join(OUTPUT_DIR, "mipd_train_cot_clean.jsonl")
output_file = os.path.join(OUTPUT_DIR, "mlops_testing_train_cot.jsonl")

# Size of the mini subset and the seed that makes the random draw reproducible.
MINI_SAMPLE_SIZE = 30
MINI_SAMPLE_SEED = 42

try:
    # Load the full dataset
    full_df = pd.read_json(input_file, lines=True)
    print(f"Successfully loaded {len(full_df)} records from {input_file}")

    # Pick MINI_SAMPLE_SIZE random samples, or keep everything when the dataset is smaller.
    if len(full_df) >= MINI_SAMPLE_SIZE:
        mini_df = full_df.sample(n=MINI_SAMPLE_SIZE, random_state=MINI_SAMPLE_SEED)
        print(f"Selected {MINI_SAMPLE_SIZE} random samples.")
    else:
        mini_df = full_df.copy()
        print(f"Dataset has less than {MINI_SAMPLE_SIZE} records. Using all {len(mini_df)} records.")

    # Save the mini dataset
    mini_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Saved {len(mini_df)} records to: {output_file}")

    # Display a sample record from the mini dataset
    if not mini_df.empty:
        print("Sample record from mlops_testing_train_cot:", mini_df.iloc[0].to_dict())

except FileNotFoundError:
    print(f"Error: The file {input_file} was not found. Please ensure it exists in your Google Drive.")
except Exception as e:
    # Best-effort cell: report any other failure instead of aborting the notebook run.
    print(f"An error occurred: {e}")

Successfully loaded 9903 records from /content/drive/MyDrive/mipd_train_cot_clean.jsonl
Selected 10 random samples.
Saved 30 records to: /content/drive/MyDrive/mlops_testing_train_cot.jsonl
Sample record from mlops_testing_train_cot: {'input': '"W Europie podpalane sƒÖ maszty. Powodem teorie spiskowe na temat sieci 5G" "Teoria m√≥wiƒÖca o szkodliwo≈õci sieci 5G i rozprzestrzenianiu przez niƒÖ koronawirusa przyczyni≈Ça siƒô do podpalenia dw√≥ch maszt√≥w pod Amsterdamem - wynika z ustale≈Ñ tamtejszej policji. Do zdarzenia dosz≈Ço w trakcie ≈õwiƒÖt wielkanocnych. Co ciekawe, zaatakowane maszty nie mia≈Çy nadajnika 5G. ‚Äì To bardzo niebezpieczne, niszczona jest wa≈ºna infrastruktura ‚Äì ostrzeg≈Ç Rob Bongenaar, dyrektor holenderskiego zrzeszenia telekom√≥w Monet. Zw≈Çaszcza ≈ºe ≈ºadne badania nie wykaza≈Çy zwiƒÖzku miƒôdzy telefoniƒÖ kom√≥rkowƒÖ a obni≈ºeniem odporno≈õci u cz≈Çowieka, a fale radiowe nie tworzƒÖ ani nie rozprzestrzeniajƒÖ wirus√≥w. Dezinformacja na temat 5G Ogromnym powodz