In [1]:
!pip install anytree

[0mCollecting anytree
  Downloading anytree-2.12.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: anytree
Successfully installed anytree-2.12.1
[0m

In [2]:
import pandas as pd
from anytree import Node, RenderTree, AsciiStyle, PreOrderIter
from copy import deepcopy

In [3]:
# Definir el número máximo de niveles
X = 2

# Leer el archivo CSV
df = pd.read_csv('FinalDataset/DBpediaClasses.csv')

In [4]:
# Crear un diccionario para almacenar los nodos
nodes = {}

# Construir los nodos y el árbol
for _, row in df.iterrows():
    parent_name = row['SubClass']
    child_name = row['Class']
    
    # Crear el nodo padre si no existe
    if parent_name not in nodes:
        nodes[parent_name] = Node(parent_name)
    
    # Crear el nodo hijo y asignarlo al nodo padre
    if child_name not in nodes:
        nodes[child_name] = Node(child_name, parent=nodes[parent_name])
    else:
        nodes[child_name].parent = nodes[parent_name]

root_nodes = [node for node in nodes.values() if node.is_root]

In [5]:
# Mostrar el árbol
for root_node in root_nodes:
    for pre, fill, node in RenderTree(root_node, style=AsciiStyle()):
        print(f"{pre}{node.name}")

root
|-- Activity
|   |-- Game
|   |   |-- BoardGame
|   |   +-- CardGame
|   |-- Sales
|   +-- Sport
|       |-- Athletics
|       |-- Boxing
|       |   |-- BoxingCategory
|       |   +-- BoxingStyle
|       +-- HorseRiding
|-- Agent
|   |-- Deity
|   |-- Employer
|   |-- Family
|   |   +-- NobleFamily
|   |-- Organisation
|   |   |-- Broadcaster
|   |   |   |-- BroadcastNetwork
|   |   |   |-- RadioStation
|   |   |   +-- TelevisionStation
|   |   |-- Company
|   |   |   |-- Airline
|   |   |   |-- Bank
|   |   |   |-- Brewery
|   |   |   |-- BusCompany
|   |   |   |-- Caterer
|   |   |   |-- LawFirm
|   |   |   |-- Publisher
|   |   |   |-- RecordLabel
|   |   |   +-- Winery
|   |   |-- EducationalInstitution
|   |   |   |-- College
|   |   |   |-- Library
|   |   |   |-- School
|   |   |   +-- University
|   |   |-- EmployersOrganisation
|   |   |-- GeopoliticalOrganisation
|   |   |-- GovernmentAgency
|   |   |-- Group
|   |   |   |-- Band
|   |   |   +-- ComedyGroup
|   |   |-- 

In [6]:
root_nodes_copy = deepcopy(root_nodes)

In [7]:
def prune_tree(node, max_depth, current_depth=1):
    # Si estamos al nivel máximo, eliminar todos los hijos
    if current_depth >= max_depth:
        node.children = []
    else:
        # Recursivamente podar cada hijo
        for child in node.children:
            prune_tree(child, max_depth, current_depth + 1)

# Podar el árbol a X niveles
for root in root_nodes_copy:
    prune_tree(root, X)

# Mostrar el árbol podado
for pre, fill, node in RenderTree(root_nodes_copy[0]):
    print("%s%s" % (pre, node.name))

root
├── Activity
├── Agent
├── Altitude
├── AnatomicalStructure
├── Area
├── Award
├── Biomolecule
├── Blazon
├── ChartsPlacements
├── ChemicalSubstance
├── Colour
├── Currency
├── Demographics
├── Depth
├── Device
├── Diploma
├── Disease
├── ElectionDiagram
├── EthnicGroup
├── Event
├── Flag
├── Food
├── GeneLocation
├── GrossDomesticProduct
├── GrossDomesticProductPerCapita
├── Holiday
├── HumanDevelopmentIndex
├── Language
├── List
├── MeanOfTransportation
├── Media
├── Medicine
├── Name
├── PenaltyShootOut
├── PersonFunction
├── Place
├── Polyhedron
├── Population
├── PublicService
├── RouteStop
├── Species
├── SportCompetitionResult
├── SportsSeason
├── Statistic
├── TimePeriod
├── TopicalConcept
├── UnitOfWork
├── Unknown
└── Work


In [8]:
df_new = pd.read_csv('FinalDataset/final_dataset.csv')

In [9]:
def get_new_class(node, podado_root):
    current_node = node
    while current_node and not any(p.name == current_node.name for p in podado_root.descendants):
        current_node = current_node.parent
    return current_node.name if current_node else None

nodes = {node.name: node for node in root_nodes[0].descendants}
nodes[root.name] = root

df_new['NewClass'] = df_new['Class'].apply(lambda x: get_new_class(nodes[x], root_nodes_copy[0]))
num_labels = num_unique_labels = df_new['NewClass'].nunique()

In [10]:
print(df_new)

                       Class                Subclass  \
0               Organisation                   Agent   
1       AdministrativeRegion                  Region   
2              OlympicResult  SportCompetitionResult   
3                  President              Politician   
4                 SoccerClub              SportsTeam   
...                      ...                     ...   
785745  AdministrativeRegion                  Region   
785746               Athlete                  Person   
785747  AdministrativeRegion                  Region   
785748             VideoGame                Software   
785749                Person                   Agent   

                                                     Text  \
0       Estudios Unísono es el nombre de un estudio de...   
1       Hartford es una ciudad ubicada en el condado d...   
2       Irak participa en los Juegos Olímpicos de 2016...   
3       Vicente Torrijos Rivera (Florencia, Caquetá, 1...   
4       El Santutxu Fú

In [11]:
df_new.to_csv('FinalDataset/polished_dataset_'+str(num_labels)+'.csv', index=False)