In [38]:
import pandas
import numpy as np
pandas.set_option('display.max_rows', 200)

In [61]:
class Concepts:
    FILM = "Q11424" 
    CITY = "Q515"
    VIDEO_GAME = "Q7889"
    
    """
    "city": d("Q2095"),
    "food": d("Q515"),
    "human": d("Q5"),
    "country": d("Q6256"),
    "year": d("Q577"),
    "tourist attraction": d("Q570116"),
    "archaeological site": d("Q839954"),
    "temple": d("Q44539"),
    "job": d("Q192581"),
    "higher_education": d("Q38723"),
    "anime film": d("Q20650540"),
    "film": d("Q11424"),
    "building": d("Q41176"),
    "mountain": d("Q8502"),
    "trail": d("Q628179"),
    "event": d("Q1656682"),
    "television series": d("Q5398426"),
    "website": d("Q35127"),
    "language": d("Q34770"),
    "human-geographic": d("Q15642541"),
    "political-territorial-entity": d("Q1048835"),
    """
    
    
def load_data(datapath="data/wikilanguage.tsv", usecols=None):
    headers = next(open(datapath)).split()
    wikis = set(e.split("_")[0] for e in headers if "wiki_" in e)
    dataset = pandas.read_csv(
        "data/wikilanguage.tsv",
        sep="\t",
        index_col="concept_id",
        usecols=usecols,
        dtype={
            "concept_id": "str",
            "concept_instances": "str",
            "concept_direct_instances": "str",
            **{
                f"{wiki}_title": "unicode"
                for wiki in wikis
            },
            **{
                f"{wiki}_pagerank": "float64"
                for wiki in wikis
            },
        },
    )

    for wiki in wikis:
        if usecols is None or f"{wiki}_pagerank" in usecols:
            dataset[f"{wiki}_pagerank"] /= dataset[f"{wiki}_pagerank"].sum()
        
    return dataset

def best_concepts(dataset, sample=0.1, n=200):
    instances_of = dataset['concept_instances'].sample(frac=sample).str.split(",").explode()
    prob_mass = pandas.concat(
        (instances_of, dataset.loc[instances_of.index]['enwiki_pagerank']), 
        axis=1, copy=False
    ).groupby(['concept_instances']).sum().nlargest(n, "enwiki_pagerank")
    ret = pandas.concat((dataset.loc[prob_mass.index]['enwiki_title'], prob_mass), axis=1, copy=False).dropna()
    return ret

def instance_of(dataset, instance_of_id):
    return dataset[dataset['concept_instances'].str.contains(f"{instance_of_id}[,$]", na=False)]

def top_ranked(dataset, wiki, n=200):
    ret = dataset[dataset[f"{wiki}_title"].notna()][["enwiki_title", f"{wiki}_title", f"{wiki}_pagerank"]]
    ret[f"{wiki}_relative_to_max"] = ret[f"{wiki}_pagerank"] / ret[f"{wiki}_pagerank"].max()
    return ret.nlargest(n, f"{wiki}_pagerank")

def top_kl(dataset, base_wiki, target_wiki, n=200, marginals=False, importance_weight=1):
    matching_rows = dataset[dataset[f"{base_wiki}_title"].notna() & dataset[f"{target_wiki}_title"].notna()].copy()
    if marginals:
        matching_rows[f"{base_wiki}_pagerank"] /= matching_rows[f"{base_wiki}_pagerank"].sum()
        matching_rows[f"{target_wiki}_pagerank"] /= matching_rows[f"{target_wiki}_pagerank"].sum()
    matching_rows["kl_divergence"] = (
        (matching_rows[f"{target_wiki}_pagerank"] ** importance_weight) * 
        (np.log(matching_rows[f"{target_wiki}_pagerank"]) - np.log(matching_rows[f"{base_wiki}_pagerank"]))
    )
    matching_rows["odds_ratio"] = matching_rows[f"{target_wiki}_pagerank"] / matching_rows[f"{base_wiki}_pagerank"]
    ret = matching_rows[[f"{base_wiki}_title", f"{target_wiki}_title", "kl_divergence", "odds_ratio"]]
    ret["kl_relative_to_max"] = ret["kl_divergence"] / ret["kl_divergence"].max()
    
    return ret.nlargest(n, "kl_divergence")

In [3]:
dataset = load_data()

In [None]:
# Memory Hungry Function
top_concepts = best_concepts(
    dataset, sample=0.1, n=500
)

In [66]:
films = instance_of(dataset, Concepts.FILM).drop(
    index=[
        "Q2537294", # All-Russia State Television and Radio Broadcas
        "Q11659568", # Minami no Teiō 
    ]
)

In [67]:
games = instance_of(dataset, Concepts.VIDEO_GAME).drop(
    index=[
        
    ]
)

In [53]:
top_ranked(games, "frwiki")

Unnamed: 0_level_0,enwiki_title,frwiki_title,frwiki_pagerank,frwiki_relative_to_max
concept_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q216293,Pong,Pong,4.444338e-06,1.0
Q49740,Minecraft,Minecraft,4.045992e-06,0.91037
Q131007,World of Warcraft,World of Warcraft,3.577368e-06,0.804927
Q173626,Pac-Man,Pac-Man,2.773629e-06,0.624081
Q94797,Grand Theft Auto IV,Grand Theft Auto IV,2.747062e-06,0.618104
Q71910,Tetris,Tetris,2.611334e-06,0.587564
Q11168,Super Mario Bros.,Super Mario Bros.,2.528707e-06,0.568973
Q853499,Forgotten Realms,Royaumes oubliés,2.520059e-06,0.567027
Q189784,Doom (1993 video game),Doom,2.383335e-06,0.536263
Q17452,Grand Theft Auto V,Grand Theft Auto V,2.181328e-06,0.490811


In [71]:
marginals = top_kl(films, "enwiki", "jawiki", marginals=True, importance_weight=1)["enwiki_title"].reset_index()
no_marginals = top_kl(films, "enwiki", "jawiki", marginals=True, importance_weight=2)["enwiki_title"].reset_index()
pandas.concat((marginals, no_marginals), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,concept_id,enwiki_title,concept_id.1,enwiki_title.1
0,Q1060124,Castle of Sand,Q698745,Mobile Suit Zeta Gundam
1,Q698745,Mobile Suit Zeta Gundam,Q1060124,Castle of Sand
2,Q3235913,Three Outlaw Samurai,Q103474,2001: A Space Odyssey (film)
3,Q21072514,High & Low: The Movie,Q155653,Spirited Away
4,Q18729861,Maku ga Agaru,Q3235913,Three Outlaw Samurai
5,Q7250018,Proof of the Man,Q21072514,High & Low: The Movie
6,Q21697406,Your Name,Q18729861,Maku ga Agaru
7,Q202681,Joint Security Area (film),Q21697406,Your Name
8,Q7637466,Summer Time Machine Blues,Q536299,Shaolin Soccer
9,Q1459428,Virus (1980 film),Q318766,The Terminal
