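### Malpedia BibTeX export: parse, de-duplicate by URL, export to CSV
The parsing library used below is python-bibtexparser; see https://github.com/sciunto-org/python-bibtexparser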

In [20]:
# ! pip install --no-cache-dir --force-reinstall git+https://github.com/sciunto-org/python-bibtexparser@main
import pandas as pd
import bibtexparser

# Read the entire BibTeX export as UTF-8 text, then hand the string to the parser.
with open("./malpedia.bib", mode='r', encoding='utf-8') as bib_file:
    bibtex_str = bib_file.read()

library = bibtexparser.parse_string(bibtex_str)

In [3]:
# Report how many blocks were parsed in total and how they split by kind.
summary = (
    f"Parsed {len(library.blocks)} blocks, including:"
    f"\n\t{len(library.entries)} entries"
    f"\n\t{len(library.comments)} comments"
    f"\n\t{len(library.strings)} strings and"
    f"\n\t{len(library.preambles)} preambles"
)
print(summary)

Parsed 13901 blocks, including:
	13901 entries
	0 comments
	0 strings and
	0 preambles


In [3]:
# Keep the first parsed entry as a sample object for inspecting the entry API.
first_entry = library.entries[0]
# Attributes available on an entry:
#   first_entry.key          -> the entry key
#   first_entry.entry_type   -> the entry type, e.g. "article"
#   first_entry.fields       -> the entry fields (e.g. author, title, etc. with their values)
#   first_entry.fields_dict  -> the entry fields, as a dictionary by field key

In [4]:
# Display the sample entry's fields as an ordered list of Field objects.
first_entry.fields

[Field(key=`author`, value=`@037`, start_line=1),
 Field(key=`title`, value=`{APT38 DYEPACK FRAMEWORK}`, start_line=2),
 Field(key=`date`, value=`2019-03-20`, start_line=3),
 Field(key=`organization`, value=`Github (649)`, start_line=4),
 Field(key=`url`, value=`https://github.com/649/APT38-DYEPACK`, start_line=5),
 Field(key=`language`, value=`English`, start_line=6),
 Field(key=`urldate`, value=`2019-12-17`, start_line=7)]

In [5]:
# Display the same fields as a dictionary keyed by field name.
first_entry.fields_dict

{'author': Field(key=`author`, value=`@037`, start_line=1),
 'title': Field(key=`title`, value=`{APT38 DYEPACK FRAMEWORK}`, start_line=2),
 'date': Field(key=`date`, value=`2019-03-20`, start_line=3),
 'organization': Field(key=`organization`, value=`Github (649)`, start_line=4),
 'url': Field(key=`url`, value=`https://github.com/649/APT38-DYEPACK`, start_line=5),
 'language': Field(key=`language`, value=`English`, start_line=6),
 'urldate': Field(key=`urldate`, value=`2019-12-17`, start_line=7)}

In [7]:
# A field's content is read through `.value`; here, the sample entry's URL.
first_entry.fields_dict['url'].value

'https://github.com/649/APT38-DYEPACK'

In [7]:
# Count the distinct URLs in the library and the entries that have no usable URL.
# NOTE: despite its name, `url_list` is a set, so repeated URLs collapse into one.
url_list = set()
url_none_count = 0
for entry in library.entries:
    # Use .get() so an entry without a `url` field is counted below instead of
    # aborting the whole scan with a KeyError.
    url_field = entry.fields_dict.get('url')
    url = url_field.value if url_field is not None else None
    if not url:
        # Covers a missing field, None and the empty string.
        url_none_count += 1
        continue  # a missing/empty URL must not be counted as a distinct URL
    url_list.add(url)

print(len(url_list))
print(f"url_none_count: {url_none_count}")

13894
url_none_count: 0


In [10]:
# Walk the entries in file order and record every URL that was already
# encountered earlier; a URL occurring k times ends up in `dupes` k-1 times.
seen = set()
dupes = []

for entry in library.entries:
    url = entry.fields_dict['url'].value
    if url not in seen:
        seen.add(url)
        continue
    dupes.append(url)

In [11]:
# Show the URLs that occur in more than one entry.
dupes

['https://i.blackhat.com/asia-21/Thursday-Handouts/as-21-Ding-Domain-Borrowing-Catch-My-C2-Traffic-If-You-Can.pdf',
 'https://www.intezer.com/blog/malware-analysis/habitsrat-used-to-target-linux-and-windows-servers/',
 'https://www.boho.or.kr/filedownload.do?attach_file_seq=2652&attach_file_id=EpF2652.pdf',
 'https://malgamy.github.io/malware-analysis/Deep-Analysis-Agent-Tesla/',
 'https://www.bitdefender.com/files/News/CaseStudies/study/415/Bitdefender-PR-Whitepaper-RedLine-creat6109-en-EN.pdf',
 'https://www.youtube.com/watch?v=T5wPwvLrBYU',
 'https://research.checkpoint.com/2021/pixstealer-a-new-wave-of-android-banking-trojans-abusing-accessibility-services/']

In [17]:
# Inspect one duplicated URL: print the type of every entry that uses it and
# collect those entries' field dictionaries for side-by-side comparison.
target_url = 'https://www.youtube.com/watch?v=T5wPwvLrBYU'
as21 = []
for entry in library.entries:
    if entry.fields_dict['url'].value != target_url:
        continue
    print(entry.entry_type)
    as21.append(entry.fields_dict)

as21

online
online


[{'author': Field(key=`author`, value=`Kaspersky`, start_line=60780),
  'title': Field(key=`title`, value=`{Video: Operation ShadowHammer: Costin Raiu and Vitaly Kamlyuk at #TheSAS2019}`, start_line=60781),
  'date': Field(key=`date`, value=`2019-05-20`, start_line=60782),
  'organization': Field(key=`organization`, value=`YouTube`, start_line=60783),
  'url': Field(key=`url`, value=`https://www.youtube.com/watch?v=T5wPwvLrBYU`, start_line=60784),
  'language': Field(key=`language`, value=`English`, start_line=60785),
  'urldate': Field(key=`urldate`, value=`2020-01-08`, start_line=60786)},
 {'author': Field(key=`author`, value=`Costin Raiu and Vitaly Kamluk`, start_line=94298),
  'title': Field(key=`title`, value=`{Operation ShadowHammer: Costin Raiu and Vitaly Kamlyuk at #TheSAS2019}`, start_line=94299),
  'date': Field(key=`date`, value=`2019-05-20`, start_line=94300),
  'organization': Field(key=`organization`, value=`Youtube (Kaspersky)`, start_line=94301),
  'url': Field(key=`url

In [25]:
class Report:
    """Plain record describing one report taken from a bibliography entry.

    All attributes start as ``None``. ``set_all_attr`` fills every attribute
    except ``platform``; ``json`` exports those same attributes as a dict.
    """

    # Attributes populated by set_all_attr() and exported by json(), in order.
    _EXPORTED_FIELDS = ('media_type', 'url', 'date', 'organization', 'title', 'author')

    def __init__(self) -> None:
        # Which platform the report came from; expected to be Malpedia.
        self.platform = None
        for field_name in self._EXPORTED_FIELDS:
            setattr(self, field_name, None)

    def set_all_attr(self, media_type, url, date, organization, title, author) -> None:
        """Assign every exported attribute in one call (``platform`` is untouched)."""
        values = (media_type, url, date, organization, title, author)
        for field_name, value in zip(self._EXPORTED_FIELDS, values):
            setattr(self, field_name, value)

    def json(self) -> dict:
        """Return the exported attributes as a dict (``platform`` is not included)."""
        return {
            field_name: getattr(self, field_name)
            for field_name in self._EXPORTED_FIELDS
        }
    
# Build one Report per distinct URL; when a URL repeats, the first entry in
# file order wins and later ones are skipped.
seen = set()
report_list: list[Report] = []
for entry in library.entries:
    fields = entry.fields_dict
    url = fields['url'].value
    if url in seen:
        continue
    seen.add(url)

    media_type = entry.entry_type
    date = fields['date'].value

    # `organization` is treated as optional. Look it up with .get() rather than
    # a bare `except:`, which would also hide unrelated errors (and even
    # KeyboardInterrupt) behind a silent None.
    organization_field = fields.get('organization')
    organization = organization_field.value if organization_field is not None else None

    # Drop the BibTeX grouping braces, e.g. `{APT38 DYEPACK FRAMEWORK}`.
    title: str = fields['title'].value
    title = title.replace('{', '').replace('}', '')
    author = fields['author'].value

    report = Report()
    report.set_all_attr(media_type, url, date, organization, title, author)
    report_list.append(report)

# Plain dicts, ready to be turned into a DataFrame.
report_json_list = [r.json() for r in report_list]

In [26]:
# One row per de-duplicated report; columns follow the keys of Report.json().
df = pd.DataFrame(data=report_json_list)

# Persist without the positional index so the CSV holds only the report columns.
df.to_csv(path_or_buf='./malpedia.csv', index=False)

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,media_type,url,date,organization,title,author
0,online,https://github.com/649/APT38-DYEPACK,2019-03-20,Github (649),APT38 DYEPACK FRAMEWORK,@037
1,online,https://blueteamblog.com/microsoft-exchange-ze...,2021-03-06,Blue Team Blog,Microsoft Exchange Zero Day’s – Mitigations an...,Auth 0r
2,online,https://blueteamblog.com/darkside-ransomware-o...,2021-05-14,Blue Team Blog,DarkSide Ransomware Operations – Preventions a...,Auth 0r


In [27]:
# Number of exported rows, i.e. one per distinct URL.
len(df)

13894