In [20]:
import re

import pandas as pd

# URL of the saved Wikipedia page that holds the table
url = "https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html"

# Read every table from the URL; the first one is used below
tables = pd.read_html(url)
original_df = tables[0]

# Remove bracketed markers such as "[A]" from both title columns
for title_column in ('Film title used in nomination', 'Original title'):
    original_df[title_column] = (
        original_df[title_column].str.replace(r'\[.*?\]', '', regex=True).str.strip()
    )


def _split_cell(value, sep=None):
    """Split one table cell into a list of stripped parts.

    Parameters:
        value: the cell content; may be a string or a non-string placeholder
            (for example NaN for a missing value).
        sep: separator passed to ``str.split``; ``None`` splits on any run of
            whitespace.

    Returns:
        A list of stripped strings, or ``[value]`` unchanged when ``value`` is
        not a string, so a missing cell still yields one record instead of
        raising ``AttributeError`` on ``.split``/``.strip``.
    """
    if not isinstance(value, str):
        return [value]
    return [part.strip() for part in value.split(sep)]


# Expand each source row into one record per (category, country, language)
expanded_rows = []
for _, row in original_df.iterrows():
    # NOTE(review): categories are tokenised on whitespace, so a multi-word
    # category name becomes several one-word records — confirm this is intended.
    categories = _split_cell(row['Category'])
    languages = _split_cell(row['Language(s)'], "\n")
    countries = _split_cell(row['Country'], "\n")

    year_cell = row['Year (Ceremony)']
    if isinstance(year_cell, str):
        leto = year_cell.split(" ")[0]  # the year, without the "(..)" suffix
        zap_st_podelitev = year_cell.split("(")[-1].replace(")", "")  # ceremony ordinal
    else:
        # Missing cell: pass the placeholder through untouched.
        leto = zap_st_podelitev = year_cell

    for category in categories:
        for country in countries:
            for language in languages:
                expanded_rows.append({
                    'naslov_nom': row['Film title used in nomination'],
                    'naslov_orig': row['Original title'],
                    'leto': leto,
                    'zap_st_podelitev': zap_st_podelitev,
                    'kategorija': category,
                    'drzava': country,
                    'jezik': language
                })


# Turn the expanded records into a DataFrame
final_df = pd.DataFrame(expanded_rows)

# Save the table as CSV
final_df.to_csv('koncna.csv', index=False)



In [25]:
import pandas as pd

# URL of the saved Wikipedia page that holds the table
url = "https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html"

# Read every table from the URL; the first one is used below
tables = pd.read_html(url)
original_df = tables[0]

# Remove "[...]" markers from the titles and "(...)" parentheticals from the categories
original_df['Film title used in nomination'] = original_df['Film title used in nomination'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
original_df['Original title'] = original_df['Original title'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
original_df['Category'] = original_df['Category'].str.replace(r'\(.*?\)', '', regex=True).str.strip()

# Extract the year and the ceremony number from the raw "1945 (18th)" text.
# This must happen BEFORE the parenthetical is stripped: the ceremony number
# lives inside the parentheses, so extracting afterwards would pick up the
# year a second time.
year_ceremony_raw = original_df['Year (Ceremony)']
original_df['leto'] = year_ceremony_raw.str.extract(r'(\d{4})', expand=False)  # four-digit year
original_df['zap_st_podelitev'] = year_ceremony_raw.str.extract(r'\((\d+)', expand=False)  # digits only, no 'th' etc.
original_df['Year (Ceremony)'] = year_ceremony_raw.str.replace(r'\(.*?\)', '', regex=True).str.strip()

# Expand categories, countries and languages: one row per value, indexed by the source row
categories_expanded = original_df['Category'].str.split('\n', expand=True).stack().reset_index(level=1, drop=True).to_frame(name='kategorija')
countries_expanded = original_df['Country'].str.split('\n', expand=True).stack().reset_index(level=1, drop=True).to_frame(name='drzava')
languages_expanded = original_df['Language(s)'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).to_frame(name='jezik')

# Join the expanded values back onto the base columns (index-on-index joins)
expanded_df = (
    original_df[['Film title used in nomination', 'Original title', 'leto', 'zap_st_podelitev']]
    .join(categories_expanded)
    .join(countries_expanded)
    .join(languages_expanded)
)

# Rename the columns to the target structure
expanded_df.columns = ['naslov_nom', 'naslov_orig', 'leto', 'zap_st_podelitev', 'kategorija', 'drzava', 'jezik']

# Save the table as CSV
expanded_df.to_csv('koncna.csv', index=False)

print("Tabela je bila uspešno shranjena kot 'koncna.csv'.")


Tabela je bila uspešno shranjena kot 'koncna.csv'.


In [67]:
import pandas as pd

# URL of the saved Wikipedia page
url = 'https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html'

# Read all tables from the page
tables = pd.read_html(url)

# Take the first table (assumed to be the right one)
df = tables[0]

# Replace the scraped headers with short working names
df.columns = ['leto', 'naslov_nom', 'naslov_orig', 'Category', 'opombe', 'Country', 'Language']


def _strip_annotations(text):
    """Remove "[...]" markers and "(...)" parentheticals from ``text``, then trim it."""
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'\(.*?\)', '', without_brackets).strip()


# Records for the output table, one per (category, language) combination
all_rows = []

for _, row in df.iterrows():
    # Year is the text before the first "("; the ceremony number is the run of
    # digits that opens the parenthetical (the "th"/"st"/"nd"/"rd" suffix is dropped).
    year_cell = str(row['leto'])
    leto = year_cell.split('(')[0].strip()
    ceremony = re.search(r'\((\d+)', year_cell)
    zap_st_podelitev = ceremony.group(1) if ceremony else ''

    # Clean the nominated title and the original title
    naslov_nom = _strip_annotations(str(row['naslov_nom']))
    naslov_orig = _strip_annotations(str(row['naslov_orig']))

    # Drop parentheticals from the category text, then split it on commas
    categories_cleaned = re.sub(r'\(.*?\)', '', str(row['Category']))
    categories = [cat.strip() for cat in categories_cleaned.split(',') if cat.strip()]

    # Split the languages on commas
    languages = [lang.strip() for lang in str(row['Language']).split(',') if lang.strip()]

    # The country cell is kept as a single value
    drzava = str(row['Country']).strip()

    for category in categories:
        for lang in languages:
            all_rows.append({
                'naslov_nom': naslov_nom,
                'naslov_orig': naslov_orig,
                'leto': leto,
                'zap_st_podelitev': zap_st_podelitev,
                'kategorija': category,
                'drzava': drzava,
                'jezik': lang
            })

# Build the output DataFrame from the collected records
final_df = pd.DataFrame(all_rows)

# Save the table to a CSV file
final_df.to_csv('academy_award_foreign_language_films.csv', index=False, encoding='utf-8-sig')

print("Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'")


Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'


In [96]:
import pandas as pd

# URL of the saved Wikipedia page
url = 'https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html'

# Read all tables from the page
tables = pd.read_html(url)

# Take the first table (assumed to be the right one)
df = tables[0]

# Replace the scraped headers with short working names.
# `df` is the same object as `tables[0]`, so this also renames the headers there.
df.columns = ['leto', 'naslov_nom', 'naslov_orig', 'Category', 'opombe', 'Country', 'Language']

# Master list of the award categories that are recognised in the 'Category' column
categories_master_list = [
    "Actor in a Leading Role",
    "Actress in a Leading Role",
    "Art Direction",
    "Cinematography",
    "Costume Design",
    "Documentary Feature",
    "Film Editing",
    "Makeup",
    "Music",
    "Original Screenplay",
    "Picture",
    "Production Design",
    "Sound Editing",
    "Visual Effects",
    "Writing",
    "Animated Feature"
]


def _strip_annotations(text):
    """Remove "[...]" markers and "(...)" parentheticals from ``text``, then trim it."""
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'\(.*?\)', '', without_brackets).strip()


def _find_known_categories(text, known_categories):
    """Return every name from ``known_categories`` that occurs in ``text``.

    Names are matched as whole words, with any run of whitespace allowed
    between the words of a multi-word name. Matches are returned in order of
    appearance (repeats are kept), with inner whitespace collapsed to single
    spaces.
    """
    # Longest names first, so a shorter alternative never wins over a longer
    # name that starts at the same position.
    by_length = sorted(
        (name for name in known_categories if name.split()),
        key=len,
        reverse=True,
    )
    if not by_length:
        return []
    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in name.split())
        for name in by_length
    )
    matcher = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    return [' '.join(found.group(0).split()) for found in matcher.finditer(text)]


def expand_rows(row):
    """Expand one source row into one record per (category, language, country).

    Parameters:
        row: mapping or pandas Series providing 'leto', 'naslov_nom',
            'naslov_orig', 'Category', 'Language' and 'Country'. Every value is
            converted with ``str`` before it is parsed.

    Returns:
        A list of dicts with the keys 'naslov_nom', 'naslov_orig', 'leto',
        'zap_st_podelitev', 'kategorija', 'drzava' and 'jezik', ordered by
        category, then language, then country. The list is empty when no name
        from ``categories_master_list`` is found in the row's category text.
    """
    # Year is the text before the first "("; the ceremony number is the run of
    # digits that opens the parenthetical (the ordinal suffix is dropped).
    year_cell = str(row['leto'])
    leto = year_cell.split('(')[0].strip()
    ceremony = re.search(r'\((\d+)', year_cell)
    zap_st_podelitev = ceremony.group(1) if ceremony else ''

    naslov_nom = _strip_annotations(str(row['naslov_nom']))
    naslov_orig = _strip_annotations(str(row['naslov_orig']))

    # Drop parentheticals, then pick out the known category names. Scanning
    # the whole text finds names of any length, including ones that follow
    # other words in the cell.
    categories_cleaned = re.sub(r'\(.*?\)', '', str(row['Category']))
    corrected_categories = _find_known_categories(categories_cleaned, categories_master_list)

    languages = [lang.strip() for lang in str(row['Language']).split(',') if lang.strip()]
    countries = [country.strip() for country in str(row['Country']).split(',') if country.strip()]

    rows = []
    for category in corrected_categories:
        for lang in languages:
            for country in countries:
                rows.append({
                    'naslov_nom': naslov_nom,
                    'naslov_orig': naslov_orig,
                    'leto': leto,
                    'zap_st_podelitev': zap_st_podelitev,
                    'kategorija': category,
                    'drzava': country,
                    'jezik': lang
                })
    return rows

# Collect the records produced for every source row into one flat list
detailed_rows = []
for _, row in df.iterrows():
    detailed_rows += expand_rows(row)

# Build the output DataFrame from the collected records
final_df = pd.DataFrame(data=detailed_rows)

# Write the table to a CSV file, without the index column
final_df.to_csv(
    'academy_award_foreign_language_films.csv',
    encoding='utf-8-sig',
    index=False,
)

print("Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'")


Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'


In [104]:
import pandas as pd

# URL of the saved Wikipedia page
url = 'https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html'

# Read all tables from the page
tables = pd.read_html(url)

# Take the first table (assumed to be the right one)
df = tables[0]

# Replace the scraped headers with short working names.
# `df` is the same object as `tables[0]`, so this also renames the headers there.
df.columns = ['leto', 'naslov_nom', 'naslov_orig', 'Category', 'opombe', 'Country', 'Language']

# Master list of the award categories that are recognised in the 'Category' column
categories_master_list = [
    "Actor in a Leading Role",
    "Actress in a Leading Role",
    "Art Direction",
    "Cinematography",
    "Costume Design",
    "Documentary Feature",
    "Film Editing",
    "Makeup",
    "Music",
    "Original Screenplay",
    "Picture",
    "Production Design",
    "Sound Editing",
    "Visual Effects",
    "Writing",
    "Animated Feature"
]


def _strip_annotations(text):
    """Remove "[...]" markers and "(...)" parentheticals from ``text``, then trim it."""
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'\(.*?\)', '', without_brackets).strip()


def _find_known_categories(text, known_categories):
    """Return every name from ``known_categories`` that occurs in ``text``.

    Names are matched as whole words, with any run of whitespace allowed
    between the words of a multi-word name. Matches are returned in order of
    appearance (repeats are kept), with inner whitespace collapsed to single
    spaces.
    """
    # Longest names first, so a shorter alternative never wins over a longer
    # name that starts at the same position.
    by_length = sorted(
        (name for name in known_categories if name.split()),
        key=len,
        reverse=True,
    )
    if not by_length:
        return []
    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in name.split())
        for name in by_length
    )
    matcher = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    return [' '.join(found.group(0).split()) for found in matcher.finditer(text)]


# Function that expands a row across its categories, countries and languages

def expand_rows(row):
    """Expand one source row into one record per (category, country, language).

    Parameters:
        row: mapping or pandas Series providing 'leto', 'naslov_nom',
            'naslov_orig', 'Category', 'Language' and 'Country'. Every value is
            converted with ``str`` before it is parsed.

    Returns:
        A list of dicts with the keys 'naslov_nom', 'naslov_orig', 'leto',
        'zap_st_podelitev', 'kategorija', 'drzava' and 'jezik', ordered by
        category, then country, then language. The list is empty when no name
        from ``categories_master_list`` is found in the row's category text.
    """
    # Year is the text before the first "("; the ceremony number is the run of
    # digits that opens the parenthetical (the ordinal suffix is dropped).
    year_cell = str(row['leto'])
    leto = year_cell.split('(')[0].strip()
    ceremony = re.search(r'\((\d+)', year_cell)
    zap_st_podelitev = ceremony.group(1) if ceremony else ''

    naslov_nom = _strip_annotations(str(row['naslov_nom']))
    naslov_orig = _strip_annotations(str(row['naslov_orig']))

    # Drop parentheticals, then pick out the known category names. Scanning the
    # whole text also finds a known name that is only part of a comma-separated
    # piece (for example one preceded by another word).
    categories_cleaned = re.sub(r'\(.*?\)', '', str(row['Category']))
    corrected_categories = _find_known_categories(categories_cleaned, categories_master_list)

    languages = [lang.strip() for lang in str(row['Language']).split(',') if lang.strip()]
    countries = [country.strip() for country in str(row['Country']).split(',') if country.strip()]

    if not countries:  # no country given: fall back to "Unknown"
        countries = ["Unknown"]

    rows = []
    for category in corrected_categories:
        for country in countries:
            for lang in languages:
                rows.append({
                    'naslov_nom': naslov_nom,
                    'naslov_orig': naslov_orig,
                    'leto': leto,
                    'zap_st_podelitev': zap_st_podelitev,
                    'kategorija': category,
                    'drzava': country,
                    'jezik': lang
                })
    return rows

# Collect the records produced for every source row into one flat list
detailed_rows = []
for _, row in df.iterrows():
    detailed_rows += expand_rows(row)

# Build the output DataFrame from the collected records
expanded_df = pd.DataFrame(data=detailed_rows)

# Write the table to a CSV file, without the index column
expanded_df.to_csv(
    'academy_award_foreign_language_films.csv',
    encoding='utf-8-sig',
    index=False,
)

print("Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'")


Tabela je bila shranjena v 'academy_award_foreign_language_films.csv'


In [103]:
import pandas as pd

# URL of the saved Wikipedia page
url = 'https://kt.ijs.si/~ljupco/lectures/papvp-2425/podatkovni-viri/List%20of%20Academy%20Award%e2%80%93winning%20foreign-language%20films%20-%20Wikipedia.html'

# Read all tables from the page
tabele = pd.read_html(url)

# Take the first table (assumed to be the right one) from the list loaded in
# THIS cell, so the cell also runs on a fresh kernel.
tabela = tabele[0]

# Replace the scraped headers with the short working names that urejanje()
# reads ('leto', 'naslov_nom', 'naslov_orig').
tabela.columns = ['leto', 'naslov_nom', 'naslov_orig', 'Category', 'opombe', 'Country', 'Language']

def urejanje(row):
    """Clean the year, ceremony number and both titles of one table row.

    Parameters:
        row: mapping or pandas Series providing 'leto', 'naslov_nom' and
            'naslov_orig'. Every value is converted with ``str`` before it is
            parsed.

    Returns:
        A dict with the keys 'leto' (text before the first "("),
        'zap_st_podelitev' (the digits that open the parenthetical, or '' when
        there are none), 'naslov_nom' and 'naslov_orig' (titles with "[...]"
        markers and "(...)" parentheticals removed and trimmed).
    """
    year_cell = str(row['leto'])
    leto = year_cell.split('(')[0].strip()
    # Keep only the digits of the ordinal, dropping "th"/"st"/"nd"/"rd".
    ceremony = re.search(r'\((\d+)', year_cell)
    zap_st_podelitev = ceremony.group(1) if ceremony else ''

    def strip_annotations(text):
        # Bracketed markers first, then parentheticals, then surrounding whitespace.
        return re.sub(r'\(.*?\)', '', re.sub(r'\[.*?\]', '', text)).strip()

    return {
        'leto': leto,
        'zap_st_podelitev': zap_st_podelitev,
        'naslov_nom': strip_annotations(str(row['naslov_nom'])),
        'naslov_orig': strip_annotations(str(row['naslov_orig'])),
    }



# Print the loaded table as plain text
print(tabela)


           leto                            naslov_nom  \
0   1945 (18th)                          Marie-Louise   
1   1954 (27th)                       Gate of Hell[A]   
2   1956 (29th)                       The Red Balloon   
3   1959 (32nd)               Serengeti Shall Not Die   
4   1960 (33rd)                       Never on Sunday   
5   1961 (34th)                         La Dolce Vita   
6   1962 (35th)                 Divorce Italian Style   
7   1963 (36th)                                 8½[B]   
8   1966 (39th)                  A Man and a Woman[B]   
9   1969 (42nd)                                  Z[B]   
10  1973 (46th)                    Cries and Whispers   
11  1983 (56th)                Fanny and Alexander[B]   
12  1985 (58th)                                   Ran   
13  1990 (63rd)                    Cyrano de Bergerac   
14  1995 (68th)                           The Postman   
15  1998 (71st)                  Life Is Beautiful[B]   
16  2000 (73rd)  Crouching Tige