In [None]:
import tarfile
import io
import zipfile
import importlib
import regex as re
import pyperclip  
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
import os

def find_doc_class(fp, name_match=False):
    '''Search for document class related lines in a file and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    # Read the content as bytes
    file_content = fp.read()
    try:
        # Try decoding with UTF-8
        file_text = file_content.decode('utf-8')
    except UnicodeDecodeError:
        # Fallback to latin-1 encoding if UTF-8 fails
        file_text = file_content.decode('latin-1')

    for line in file_text.splitlines():
        if doc_class_pat.search(line):
            if name_match:
                if sub_doc_class.search(line):
                    return -99999
                return 1  # Found document class line
    return 0  # No document class line found

def find_main_tex_source_in_tar(tar_file, encoding='utf-8'):
    tex_names = set(["paper", "main", "ms.", "article"])
    tex_files = [f for f in tar_file.getnames() if f.endswith('.tex')]

    if len(tex_files) == 1:
        return tex_files[0]

    main_files = {}
    for tf in tex_files:
        depth = len(tf.split('/')) - 1
        has_main_name = any(kw in tf for kw in tex_names)
        fp = tar_file.extractfile(tf)
        if fp:
            main_files[tf] = find_doc_class(fp, name_match=has_main_name) - depth
            fp.close()

    return max(main_files, key=main_files.get) if main_files else None

def pre_format(text):
    source_text = text.replace('\\}\\', '\\} \\').replace(')}', ') }').replace(')$', ') $')
    return source_text

def source_from_tar(tar_file, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_file, encoding=encoding)
    if tex_main:
        fp = tar_file.extractfile(tex_main)
        if fp is not None:
            file_content = fp.read()  # Read as bytes to keep it in memory
            try:
                # Attempt to decode using UTF-8
                source_text = pre_format(file_content.decode(encoding))
            except UnicodeDecodeError:
                # Fallback to latin-1 encoding if UTF-8 fails
                source_text = pre_format(file_content.decode('latin-1'))
            return source_text
    return None



def extract_before_abstract(source_text):
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)
    no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\$[^$]*\$', '', no_usepackage_text)
    text = no_usepackage_text.replace('{', '').replace('}', '').replace('\n', ' ')
    text = ' '.join(no_usepackage_text.split())
    abstract_match = re.search(r'\\begin\s*\{\s*abstract\s*\}', text)

    if abstract_match:
        return text[:abstract_match.start()].strip()
    
    abstract_word_match = re.search(r'\babstract\b', text, re.IGNORECASE)
    if abstract_word_match:
        return text[:abstract_word_match.start()].strip()
    return None

zip_file_path = "./2401.zip"

with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]

    for tar_name in tar_files:
        with zip_file.open(tar_name) as tar_bytes:
            tar_file = tarfile.open(fileobj=io.BytesIO(tar_bytes.read()), mode='r:gz')
            source_text = source_from_tar(tar_file)
            if source_text:
                pyperclip.copy(source_text)
                content_before_abstract = extract_before_abstract(source_text)
                if content_before_abstract:
                    print(f"Content before abstract in {tar_name}:\n{content_before_abstract}\n")
                else:
                    print(f"No abstract found in {tar_name}, or no content before abstract.\n")
            tar_file.close()


Content before abstract in 2401/2401.00006v1.tar.gz:
\documentclass{article} \newcommand{\theHalgorithm}{\arabic{algorithm}} \usepackage[accepted]{icml2024} \usepackage[capitalize,noabbrev]{cleveref} \theoremstyle{plain} \newtheorem{theorem}{Theorem}[section] \newtheorem{proposition}[theorem]{Proposition} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{corollary}[theorem]{Corollary} \theoremstyle{definition} \newtheorem{definition}[theorem]{Definition} \newtheorem{assumption}[theorem]{Assumption} \theoremstyle{remark} \newtheorem{remark}[theorem]{Remark} \usepackage[textsize=tiny]{todonotes} \usepackage[bb=boondox]{mathalfa} \usepackage[T1]{fontenc} \usepackage[normalem]{ulem} \useunder{\uline}{\ul}{} \newcommand{\fix}{\marginpar{FIX}} \newcommand{\new}{\marginpar{NEW}} \newcommand{\framework}{OpenContra} \newcommand{\Contra}{Contra} \newcommand{\bfemph}[1]{\textbf{\textit{#1}}} \newcommand{\ming}[1]{{\bf \color{orange} [[Ming says ``#1'']]}} \newcommand{\qi}[1]{{\bf \color{blue} [[Qi s

In [None]:
# Read input from a text file and filter out files without valid content before the abstract
input_file_path = 'input.txt'  # Replace with your actual file path
output_file_path = 'filtered_files_with_content.txt'

# Variables to keep track of statistics
total_files_count_author = 0
valid_files_count = 0
valid_files_with_content = []

# Reading and processing the input file
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')
    
    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)  # Split to separate the file name from its content
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                
                # Check if the content does not indicate "No abstract found"
                if 'No abstract found' not in content and 'no content before abstract' not in content.lower():
                    valid_files_count += 1
                    valid_files_with_content.append((file_name, content))

# Writing the filtered results to an output file
with open(output_file_path, 'w') as outfile:
    for file_name, content in valid_files_with_content:
        outfile.write(f"Content before abstract in {file_name}:\n{content}\n\n")

# Print or save the statistics summary
# print(f"Total number of files processed: {total_files_count}")
# print(f"Total number of files with valid content before abstract: {valid_files_count}")
# print(f"Filtered output saved in: {output_file_path}")


2184

2240

855

497

908

825

1377

2401

4351

5882

1130

1116

1396

1396

1404

1393

939

939

2999

1754

1300

1639

804

1542

1904

17990

460

638

2743

1137

2325

2285

1924

1338

1537

2307

2374

5175

1725

433

1300

2838

988

988

1852

962

980

1115

974

2524

533

1129

968

2348

1262

1274

1416

1416

1489

913

3043

3043

4435

767

3446

3512

3726

1506

1628

4035

2739

1007

1670

821

2919

712

4059

4078

1777

1771

3011

749

890

1213

1213

2426

1319

2136

1510

1510

1849

745

745

718

1205

972

972

972

1112

1406

971

3099

1119

647

4611

822

2217

2313

1692

961

1518

1518

1518

1518

1518

1023

2419

1818

1303

956

1359

4323

573

573

1930

1930

1930

733

935

1309

772

772

772

1535

377

394

394

420

338

1070

6516

1136

1280

5929

1118

2622

3178

2008

2077

1968

2079

2483

1645

657

657

942

941

1947

1574

701

2347

1411

1411

1661

1199

774

1378

902

902

1205

8318

3527

1376

1793

1038

1038

3384

865

917

1905

1905

686

1646

440

2697

1335

4494

1483

1585

875

880

1131

438

742

1992

1014

1074

2080

2080

2080

1308

2822

419

2781

2781

1220

1022

5315

5330

2166

1173

808

808

2466

2466

1997

2210

1194

1018

2052

1475

1380

2029

2593

1325

1887

1887

4290

1132

889

2528

840

3419

933

1041

1369

1369

882

1363

1428

845

6144

1228

648

902

901

1561

1190

1662

2319

2145

2551

1904

5079

2739

2065

1538

10868

10985

10985

919

1488

1099

1559

1727

1469

1469

939

1355

1361

992

3119

2992

2992

951

596

608

5843

1163

907

2962

1326

1055

1482

2088

718

578

12571

897

897

1320

2692

844

844

10330

1011

791

815

860

1347

1407

1609

7380

7380

2464

1338

498

1349

1737

2138

822

10025

1235

24062

1339

7625

486

1646

1859

8629

1751

1543

837

2987

1373

498

1594

1593

1104

1908

556

1932

1018

1507

1831

2316

8003

837

659

1391

2214

1142

686

1331

1449

1604

1049

3317

1112

1107

539

1720

675

3106

670

3881

1941

3257

1062

1062

1728

1283

1172

2611

2101

1574

2742

1312

1273

450

664

1645

1457

1929

2123

1856

3163

562

1374

1495

3029

3029

1534

1406

1229

1229

4268

2094

1007

1007

624

1459

281

281

787

745

2605

1263

2701

4639

1471

1303

1493

637

1484

1312

2342

1263

2027

2326

3476

641

862

2335

10704

2273

1872

645

1131

1434

3886

843

878

3515

1332

843

747

1251

874

874

891

533

533

826

1393

981

685

1459

442

12238

3161

622

1021

2214

2214

1305

2621

2623

1574

1574

4857

2517

2421

1293

2753

1737

2707

637

1047

5983

5983

3040

1883

2723

2723

4874

2074

1266

956

1364

906

1282

13737

1611

616

1284

1520

1212

944

1274

756

579

1861

1480

537

1239

4085

3298

982

1165

1240

1240

1886

2593

965

1029

1024

919

1181

3392

1003

893

3615

2196

2927

2927

693

693

693

2623

476

697

645

855

822

290

1239

2142

1383

2461

5568

3909

1593

546

847

476

474

2750

1715

3178

1933

1826

1208

4919

593

3473

1055

1525

1541

498

1447

1447

1447

1077

1523

934

3387

861

863

3475

1605

4158

1355

1590

2506

5569

3695

681

5452

1537

584

1642

4602

4609

1597

1535

1914

441

223

1365

2196

1153

1273

1273

1273

5005

3898

780

780

780

2929

1891

2459

2163

1058

1058

1309

543

2230

1092

3338

3054

3018

3018

1075

7721

2596

2528

6569

1489

1918

4372

944

6151

1329

1474

587

843

1555

1240

491

1529

1471

6515

455

1809

1357

1703

3136

1773

166

1130

1163

1560

1484

849

978

1522

3478

3350

1506

1760

2099

772

2676

718

1464

6095

510

510

510

807

2666

534

1212

552

1415

1498

1463

1870

1870

1870

1870

2847

3295

1645

3343

1208

818

1622

1685

19779

19779

2240

1280

9301

1354

1354

1248

1716

1020

5926

690

2344

639

639

639

4762

1311

1362

1609

2507

5474

943

23393

1841

717

780

416

2554

1142

1138

3452

1574

3034

4573

1735

3596

1757

922

1294

968

2436

873

1463

774

3581

1125

812

1284

4753

604

1474

2667

5676

1670

795

2186

2107

594

1415

1855

1557

1557

1831

1092

694

850

3425

812

479

479

499

499

2276

2276

2879

2879

1106

2011

2694

2291

5539

2193

2216

4121

3807

1793

833

1029

1887

650

760

780

728

2098

9270

9258

5321

5321

781

2740

7720

897

947

669

1114

2706

383

2710

715

482

618

973

963

588

588

1479

1631

3805

606

1224

1660

1778

1446

1316

764

852

1615

1014

2413

969

969

5963

752

2918

1507

2130

928

1546

517

530

530

2692

1343

2340

2340

2340

532

2060

1725

1726

1287

1326

856

908

908

3674

1081

2946

708

1431

830

1220

1682

1467

1467

1467

860

1455

1033

1679

661

3316

3316

1841

1542

2424

320

3511

692

758

905

1335

640

634

364

1256

1256

995

5110

1148

1148

956

448

684

4251

589

1641

1380

1604

923

1775

904

1178

1593

2082

3523

4371

1804

776

1073

1424

432

1050

6266

1595

1630

1882

1882

1281

3251

1289

1181

799

1648

555

1233

5476

886

1945

1841

1841

1395

2777

1579

1659

4353

741

1965

2014

3948

2531

2008

3524

603

1222

598

1307

865

2464

685

886

1090

765

1564

2014

648

591

4115

2472

2720

2720

1419

927

1203

1201

992

2337

1527

16359

1870

1870

2532

1133

2066

535

1374

1608

1810

476

2219

2577

781

781

1253

400

1848

823

1889

1889

1537

5024

1959

515

958

958

698

891

1350

1611

2259

1823

634

622

981

971

3041

2305

534

1726

1772

1374

1193

1059

924

1845

1518

362

2649

1360

846

2240

1462

865

758

723

723

2261

1471

2523

1589

1066

3773

1140

1218

3317

2546

870

924

7141

2134

2592

8704

1605

1605

1543

2112

1935

1052

398

1281

983

1190

3787

504

1852

1958

2337

766

5691

3233

1103

1057

1070

1401

656

1650

1590

1500

677

358

2168

1031

4026

832

1403

948

691

320

763

716

716

1238

769

2330

1571

1556

893

1595

1419

617

822

1516

1028

549

643

735

2385

3131

3131

3093

3566

598

948

1575

1445

1509

2728

685

1934

867

2591

2504

2219

618

1442

755

1103

835

1908

1169

1079

3093

3127

1771

1761

899

2952

3467

1350

1939

772

1372

1241

1006

1731

1104

698

871

861

1126

1768

4984

1556

1067

1455

1738

1972

1703

7641

628

662

662

662

999

573

1384

624

682

930

652

1813

5510

5629

573

1181

1019

2083

1025

759

1527

3067

1300

727

947

1121

6973

688

1481

1481

1254

3118

2836

2439

1104

407

797

1268

1768

2842

660

788

4484

2146

1439

2546

860

779

696

133

133

961

1643

1524

1256

2250

1884

1884

4634

991

1636

1453

2201

1188

807

2433

949

2703

7613

459

459

783

2619

1116

323

735

1508

1630

2717

1400

490

3180

1074

1210

1315

1462

739

1227

1967

965

587

915

418

2291

2430

1298

1646

1066

2870

2704

2704

1151

832

1991

1514

3050

2242

1293

1280

3103

3103

2136

1465

862

862

453

737

737

1502

1770

988

3128

821

1672

1351

802

802

1947

1948

1769

1769

2114

2256

836

594

1913

1505

1731

649

815

844

3535

3702

2133

506

1014

1002

1137

987

1856

1868

805

805

1011

1223

1223

1826

967

1154

1078

1078

1494

3444

2471

1154

5291

5907

1364

1364

1336

1224

4513

2231

1223

1714

1490

1670

876

310

1383

1964

1964

800

1351

1013

830

1081

939

1738

1738

1738

768

598

598

1301

1820

1316

2728

1080

1034

2369

1010

4815

638

624

881

2201

722

1792

1249

2446

2732

4051

1103

1103

1504

1261

794

1417

620

2874

512

1822

740

871

735

501

1929

2148

960

1045

776

869

1358

1727

1525

1229

889

889

1452

625

938

1197

1121

1620

847

1472

1445

2453

2453

2425

8186

289

811

2334

2334

1040

1115

1384

1382

1602

754

1028

1071

1161

879

1515

1374

1091

1176

651

2468

2505

1930

1078

2864

1386

1275

778

745

1323

878

744

484

1280

930

1079

1315

1700

1358

2880

1148

2132

1664

2312

2935

990

1037

1037

547

547

1396

965

859

538

1457

809

1742

3432

855

852

622

496

427

427

427

7983

1506

1605

1190

3091

873

2176

1737

1307

1130

3346

1623

1767

1998

1329

1329

1329

885

885

891

504

1463

5154

1420

738

776

776

2100

1135

757

761

3918

848

1267

1704

1511

2150

1415

987

1468

2205

1733

534

3775

721

721

508

1977

1977

1977

5490

518

997

2266

1700

1085

1111

2577

233

695

1952

963

1224

1572

2091

4689

1294

489

548

548

2623

1443

672

445

1058

1058

1407

1691

3904

1549

12448

944

944

1249

1113

1908

2709

867

1918

1396

1404

1356

1016

3195

2225

1491

1006

2614

330

660

1399

752

1152

1895

978

1209

871

1721

1168

2612

1163

520

2748

698

695

9360

3008

3064

1253

1183

922

627

742

841

2084

775

775

1174

932

542

4195

1535

555

744

1338

3200

1101

1087

1088

3070

1406

1566

1899

1605

2367

565

1220

2658

961

2344

1259

4041

784

1509

916

1497

1437

1834

1826

1221

895

3953

4597

2237

999

1506

262

804

833

2585

611

974

598

1700

1747

1795

2235

2235

2235

2275

1076

5610

2212

1114

925

1413

1407

802

1802

1047

3291

1538

1149

2591

1184

561

531

531

531

735

1126

860

3059

1135

1135

1403

1143

1168

713

429

2091

574

1088

1094

2574

1514

1463

1058

1823

1527

564

1710

1078

458

481

1240

704

939

1616

1646

8226

1387

3732

2443

939

460

1061

674

4755

4924

698

13445

13445

942

1239

3066

1406

2560

1159

1163

780

742

781

1127

1188

1494

1773

947

1079

1132

1134

1380

1235

446

415

573

1284

3289

1156

1882

1239

1712

1859

1139

813

813

2077

502

2414

1644

1195

3349

1243

3897

3897

1227

2737

2737

2765

2554

664

1297

1297

1357

3070

13925

580

1603

1603

1702

1702

1702

2714

4024

1436

796

796

692

1136

3786

1014

945

1853

3227

1789

1226

1137

1093

325

325

3849

3849

1608

1591

2224

1419

1419

1312

1649

1684

3581

1590

20604

2874

1257

1536

1585

2029

2069

2069

648

1424

1407

1098

948

1827

2200

9002

1363

5521

1041

730

1451

3125

2752

681

964

642

1667

1883

1343

1045

273

2800

738

612

1450

1719

1628

2378

2378

759

2424

1046

2349

1177

3834

2070

2478

1232

1865

395

1219

1308

574

893

2791

1506

1506

1859

2340

1038

1796

1601

1277

1497

579

6507

1035

1035

443

1819

1359

793

3387

1443

2092

2109

1850

952

2159

3730

5973

521

521

1244

2517

1030

1030

442

442

607

5171

1975

1708

968

1638

730

554

2205

2249

2205

4415

4414

687

8698

1627

1978

1978

842

775

1792

1189

2346

1401

1240

891

1937

1959

4116

1129

1621

1921

668

4823

22006

543

2401

348

348

348

3018

737

2848

1078

1306

937

1424

1873

4874

820

939

841

2006

709

709

939

1348

5636

761

1287

742

3306

2260

2366

881

2082

1216

1366

653

1425

1250

2248

2248

1824

5433

3124

3124

1260

1856

1942

1060

1584

4553

6072

1855

2148

2261

1892

1856

1856

2421

1587

9665

1804

1592

677

742

2559

877

1510

1777

1599

3501

3172

3129

3227

1239

881

849

1572

2951

1894

942

1219

3178

1550

1021

1144

860

3166

1490

1490

682

1787

347

843

1043

1643

708

5058

1494

1498

3526

2026

2367

1428

4390

1074

9724

9724

9724

1145

1312

495

2060

2060

2606

2269

2419

5605

731

1769

924

473

1496

1457

1691

2162

1447

1705

2299

2299

2210

1940

805

1760

893

1460

1937

1263

1264

1150

729

1647

1478

1887

865

865

3042

2370

4049

1919

2681

1947

880

880

1014

1682

4008

473

1186

1441

1020

647

647

2545

2755

1722

2079

1115

3357

2695

1326

1822

3559

2145

2145

883

455

2045

3321

2023

1688

959

959

2614

896

1238

2035

1921

746

6813

3918

1664

3339

1308

858

858

1680

4659

517

1722

1567

2385

691

631

749

3180

2364

2474

2197

796

1626

1822

1098

1174

1486

783

1758

1000

1536

1563

1563

2060

3235

4778

856

460

1601

1675

8236

1082

1428

2502

1598

2048

1742

383

1056

803

968

890

1234

1240

1126

2328

884

884

2246

1945

1273

723

917

2382

4721

1817

In [32]:
# Define the input and output file paths
input_file_path = 'filtered_files_with_content.txt'
output_file_path = 'filtered_without_tags.txt'

# Tags to search for and their counters
tags_to_search = [r'\\affiliations', r'\\affiliation', r'\\icmlaffiliation',  r'\\institute', r'\\affil', r'\\aff', r'\\AFF',r'\\address']
tag_counts = {tag: 0 for tag in tags_to_search}

# Helper function to check and remove content with specified tags
def contains_and_remove_tags(content, tags):
    for tag in tags:
        if re.search(tag, content):
            tag_counts[tag] += 1
            return True  # Stop at the first match and remove the paper's content
    return False

# Process the input and create the new output without specified tags
total_files_count = 0
files_kept_count = 0
files_without_tags = []

with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count += 1
            lines = block.split('\n', 1)
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()

                # Check if the content includes any of the tags and remove if found
                if not contains_and_remove_tags(content, tags_to_search):
                    files_kept_count += 1
                    files_without_tags.append(f"Content before abstract in {file_name}:\n{content}\n")

# Write the filtered content to the output file
with open(output_file_path, 'w') as outfile:
    outfile.write('\n'.join(files_without_tags))

# Print statistics
print(f"Total number of files processed: {total_files_count}")
for tag, count in tag_counts.items():
    print(f"Number of papers with tag '{tag}': {count}")
print(f"Total number of files kept after filtering: {files_kept_count}")
print(f"Filtered output saved in: {output_file_path}")


1717501

Total number of files processed: 2134
Number of papers with tag '\\affiliations': 107
Number of papers with tag '\\affiliation': 593
Number of papers with tag '\\icmlaffiliation': 54
Number of papers with tag '\\institute': 107
Number of papers with tag '\\affil': 125
Number of papers with tag '\\aff': 7
Number of papers with tag '\\AFF': 7
Number of papers with tag '\\address': 204
Total number of files kept after filtering: 930
Filtered output saved in: filtered_without_tags.txt


In [33]:
import re

# Define the input and output file paths
input_file_path = 'filtered_without_tags.txt'
output_file_path = 'filtered_without_authors.txt'

# Counter for the number of filtered and kept files
total_files_count_author = 0
filtered_files_count = 0
kept_files_count = 0

# Function to check if the content has an \author{} tag with line breaks
def contains_author_with_line_break(content):
    author_tag_pattern = re.compile(r'\\author\s*{.*?\\.*?}', re.S)
    return bool(author_tag_pattern.search(content))

# List to store files without the filtered \author{} tag content
files_without_author_tag = []

# Read the input file and process each content block
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()

                # Check if the content has \author{} with \\ line breaks
                if contains_author_with_line_break(content):
                    filtered_files_count += 1
                else:
                    kept_files_count += 1
                    files_without_author_tag.append(f"Content before abstract in {file_name}:\n{content}\n")

# Write the filtered content to the output file
with open(output_file_path, 'w') as outfile:
    outfile.write('\n'.join(files_without_author_tag))

# Print statistics
print(f"Total number of files processed: {total_files_count}")
for tag, count in tag_counts.items():
    print(f"Number of papers with tag '{tag}': {count}")
# print(f"Total number of files processed: {total_files_count_author}")
print(f"Number of files with \\author{{}} containing line breaks: {filtered_files_count}")
print(f"Total number of files kept after filtering: {kept_files_count}")
print(f"Filtered output saved in: {output_file_path}")


308880

Total number of files processed: 2134
Number of papers with tag '\\affiliations': 107
Number of papers with tag '\\affiliation': 593
Number of papers with tag '\\icmlaffiliation': 54
Number of papers with tag '\\institute': 107
Number of papers with tag '\\affil': 125
Number of papers with tag '\\aff': 7
Number of papers with tag '\\AFF': 7
Number of papers with tag '\\address': 204
Number of files with \author{} containing line breaks: 786
Total number of files kept after filtering: 144
Filtered output saved in: filtered_without_authors.txt


Total number of files processed: 2134
Number of papers with tag '\affiliations': 107 (5%)
Number of papers with tag '\\affiliation': 593 (28%)
Number of papers with tag '\\icmlaffiliation': 54 (2.5%)
Number of papers with tag '\\institute': 107 (5%)
Number of papers with tag '\\affil': 125 (6%)
Number of papers with tag '\\aff': 7 (0.3%)
Number of papers with tag '\\AFF': 7 (0.3%)
Number of papers with tag '\\address': 204 (10%)
Number of files with \author{} containing line breaks: 786 (36%)
-- 1763 / 2134 (83%) paper used those tags for affiliation info --- 
Total number of files kept after filtering: 144 (7%)


\ARTICLEAUTHORS
\thanks

In [None]:
from typing import List, Tuple

def find_destination(height_map: List[List[int]], start_row: int, start_col: int) -> Tuple[int, int]:
    rows, cols = len(height_map), len(height_map[0])
    memo = {}  # Memoization table to store destinations for each cell

    def dfs(row: int, col: int) -> Tuple[int, int]:
        # If we have already computed the destination for this cell, return it
        if (row, col) in memo:
            return memo[(row, col)]
        
        # Initialize the current cell as the destination
        destination = (row, col)
        min_height = height_map[row][col]

        # Define directions: up, down, left, right
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

        # Explore each direction
        for dr, dc in directions:
            new_row, new_col = row + dr, col + dc
            if 0 <= new_row < rows and 0 <= new_col < cols:
                if height_map[new_row][new_col] < min_height:
                    # Recurse to the lower height cell
                    dest = dfs(new_row, new_col)
                    destination = dest
                    min_height = height_map[new_row][new_col]
        
        # Store result in memo
        memo[(row, col)] = destination
        return destination

    # Start DFS from the specified start cell
    return dfs(start_row, start_col)
