In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Stream the MediaWiki XML dump instead of loading it whole: ET.parse() builds
# the complete element tree in memory, whereas ET.iterparse() lets every
# finished top-level element be released as soon as it has been read.
DUMP_PATH = 'azwiki-latest-pages-articles.xml'
PAGE_FIELDS = ('id', 'title', 'ns')


def iter_page_records(source):
    """Yield one record per <page> that is a direct child of the dump's root.

    Parameters
    ----------
    source : str or binary file object
        The MediaWiki XML dump to read.

    Yields
    ------
    dict
        ``{'id': ..., 'title': ..., 'ns': ...}`` holding the text of the
        page's direct ``<id>``, ``<title>`` and ``<ns>`` children (a string,
        or None when the element has no text).

    Raises
    ------
    ValueError
        If a <page> lacks one of the expected child elements.
    """
    events = ET.iterparse(source, events=('start', 'end'))
    _, root = next(events)  # the first event is the start of the root element

    # Reuse the root's own '{namespace}' prefix so the export schema version
    # (e.g. export-0.11) does not have to be hard-coded.
    prefix = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''
    page_tag = prefix + 'page'

    depth = 0  # nesting level below the root; 0 means "directly under root"
    for event, elem in events:
        if event == 'start':
            depth += 1
            continue
        depth -= 1
        if depth != 0:
            continue  # nested element (or the root's own end event)

        if elem.tag == page_tag:
            record = {}
            for field in PAGE_FIELDS:
                child = elem.find(prefix + field)
                if child is None:
                    raise ValueError(f'<page> element is missing its <{field}> child')
                record[field] = child.text
            yield record

        # Drop the completed top-level element so memory use stays flat.
        root.clear()


# Collect the page records
data = list(iter_page_records(DUMP_PATH))

# Convert to DataFrame; fixing the columns keeps the CSV header even when the
# dump contains no pages.
df = pd.DataFrame(data, columns=list(PAGE_FIELDS))

# Save DataFrame to CSV with UTF-8-SIG encoding
df.to_csv('pages.csv', encoding='utf-8-sig', index=False)


In [2]:
# Load the page table from pages.csv for a quick inspection.
pages = pd.read_csv('pages.csv', encoding='utf-8-sig')

# Display the first few rows of the DataFrame
print("Head of the DataFrame:")
print(pages.head())

# Display the number of rows and columns
print("\nShape of the DataFrame:")
print(pages.shape)

# Display basic information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nBasic Information:")
pages.info()

# Display summary statistics for every column, numeric or not
print("\nSummary Statistics:")
print(pages.describe(include='all'))

Head of the DataFrame:
   id                     title  ns
0   1                Ana səhifə   0
1   2                  HomePage   0
2  54     MediaViki:Subjectpage   8
3  56   MediaViki:Wikipediapage   8
4  60  MediaViki:Redirectedfrom   8

Shape of the DataFrame:
(457518, 3)

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457518 entries, 0 to 457517
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      457518 non-null  int64 
 1   title   457518 non-null  object
 2   ns      457518 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 10.5+ MB
None

Summary Statistics:
                   id       title             ns
count   457518.000000      457518  457518.000000
unique            NaN      457518            NaN
top               NaN  Ana səhifə            NaN
freq              NaN           1            NaN
mean    494482.439537         NaN       5.868049
std     250160.925556         NaN     

In [4]:
# Read the page table back from disk.
df = pd.read_csv('pages.csv', encoding='utf-8-sig')

# Keep only the pages whose namespace number is 0, 14 or 100.
wanted_namespaces = [0, 14, 100]
ns_as_int = df['ns'].astype(int)
in_wanted_namespace = ns_as_int.isin(wanted_namespaces)
filtered_df = df[in_wanted_namespace]

# Number of pages that survived the filter.
row_count = filtered_df.shape[0]

# Preview the first rows of the filtered table.
filtered_df.head()


Unnamed: 0,id,title,ns
0,1,Ana səhifə,0
1,2,HomePage,0
35,604,İnternet,0
37,610,Tarix,0
38,611,Dağlıq Qarabağ,0


In [5]:
# Show how many rows filtered_df holds (row_count = filtered_df.shape[0]).
row_count


404751

In [8]:
# Write the namespace-filtered pages to disk, without the index column.
filtered_df.to_csv(
    'pages_complete.csv',
    index=False,
    encoding='utf-8-sig',
)

In [17]:
# Load the filtered page table and the page-link table.
pages_df = pd.read_csv('pages_complete.csv', encoding='utf-8-sig')
pagelinks_df = pd.read_csv('pagelinks.csv', encoding='utf-8-sig')

# Report the row count of each table.
n_pages, n_links = pages_df.shape[0], pagelinks_df.shape[0]
print(n_pages, ' filtered pages', sep='')
print(n_links, ' unfiltered links', sep='')

404751 filtered pages
10599277 unfiltered links


In [21]:
# IDs of the pages present in pages_df.
valid_ids = set(pages_df['id'].astype(int))

# Keep a link only when both of its endpoints are IDs found in pages_df.
source_is_valid = pagelinks_df['link_from'].astype(int).isin(valid_ids)
target_is_valid = pagelinks_df['link_to'].astype(int).isin(valid_ids)
valid_pagelinks_df = pagelinks_df[source_is_valid & target_is_valid]

# Number of links that remain after the filter.
final_row_count = valid_pagelinks_df.shape[0]
final_row_count

4082783

In [22]:
# Write the filtered links to disk, without the index column.
valid_pagelinks_df.to_csv(
    'pagelinks_complete.csv',
    index=False,
    encoding='utf-8-sig',
)