In [1]:
import pandas as pd
import numpy as np
import unicodedata
import difflib
import unidecode

In [2]:
platforms = ["Guardian", "NME1", "Pitchfork", "Spectrum", "DIY", "LAQ", "Gigwise", "Ohm", "Skinny", "Uncut", "UTR"]
paths = [f"C:\\Users\\tommy\\OneDrive\\University\\Year 3\\Third Year Project\\Platform Album Data\\{platform}.csv" for platform in platforms]

In [3]:
dfs = {platform: pd.read_csv(path, index_col=0) for platform, path in zip(platforms, paths)}

In [4]:
{platform: df.shape[0] for platform, df in dfs.items()}

{'Guardian': 183479,
 'NME1': 5052,
 'Pitchfork': 25396,
 'Spectrum': 7933,
 'DIY': 2234,
 'LAQ': 2153,
 'Gigwise': 1922,
 'Ohm': 1488,
 'Skinny': 3734,
 'Uncut': 10969,
 'UTR': 4200}

In [5]:
dfs = {platform: df.drop_duplicates() for platform, df in dfs.items()}

for platform, df in dfs.items():
    
    # df.drop(columns=["Platform"], axis=1, inplace=True)
    df.loc[:, "Album"] = df["Album"].apply(lambda x: unidecode.unidecode(unicodedata.normalize("NFKD", str(x))))
    df.loc[:, "Artist"] = df["Artist"].apply(lambda x: unidecode.unidecode(unicodedata.normalize("NFKD", str(x))))
    df.loc[:, "Album"] = df["Album"].apply(lambda x: x.strip("'`").replace(".", "").replace(",", "").replace("–", "").replace("-", "").replace(
        "&", "And").lower().replace("' review", "").strip())
    df.loc[:, "Artist"] = df["Artist"].apply(lambda x: x.strip("'`").replace(".", "").replace(",", "").replace("–", "").replace("-", "").replace(
        "&", "And").replace("album review:", "").lower().strip())
    df.rename({"Url": f"{platform}_url"}, axis=1, inplace=True)
    dfs[platform] = df


In [35]:
platforms = list(dfs.keys())
p1 = []
p2 = []
intersect = []

for i in range(len(dfs)):
    for j in range(len(dfs)):
        platform1 = platforms[i]
        platform2 = platforms[j]
        n = pd.merge(dfs[platform1], dfs[platform2], how="inner", on=["Album", "Artist"]).shape[0]
        p1.append(platform1)
        p2.append(platform2)
        intersect.append(n)

data = pd.DataFrame({"Platform1": p1, "Platform2": p2, "n": intersect})
data.loc[data["Platform1"] == data["Platform2"]] = None
data.loc[data["Platform1"] == "NME1", "Platform1"] = "NME"
data.loc[data["Platform2"] == "NME1", "Platform2"] = "NME"
        
pd.crosstab(data.loc[:, "Platform1"], data.loc[:, "Platform2"], intersect, aggfunc=sum).style.background_gradient(axis=None).format(precision=0)

Platform2,DIY,Gigwise,Guardian,LAQ,NME,Ohm,Pitchfork,Skinny,Spectrum,UTR,Uncut
Platform1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DIY,,407.0,1091.0,729.0,1043.0,488.0,1285.0,707.0,900.0,693.0,133.0
Gigwise,407.0,,316.0,269.0,433.0,295.0,375.0,155.0,279.0,201.0,50.0
Guardian,1091.0,316.0,,715.0,1769.0,571.0,2931.0,1128.0,1954.0,1027.0,709.0
LAQ,729.0,269.0,715.0,,536.0,510.0,1070.0,530.0,654.0,477.0,140.0
NME,1043.0,433.0,1769.0,536.0,,520.0,2056.0,879.0,1339.0,826.0,270.0
Ohm,488.0,295.0,571.0,510.0,520.0,,717.0,291.0,547.0,338.0,121.0
Pitchfork,1285.0,375.0,2931.0,1070.0,2056.0,717.0,,1605.0,4035.0,1782.0,1167.0
Skinny,707.0,155.0,1128.0,530.0,879.0,291.0,1605.0,,1114.0,707.0,211.0
Spectrum,900.0,279.0,1954.0,654.0,1339.0,547.0,4035.0,1114.0,,1254.0,510.0
UTR,693.0,201.0,1027.0,477.0,826.0,338.0,1782.0,707.0,1254.0,,245.0


In [43]:
sample = ["Spectrum", "NME1", "Pitchfork"]
sample_df = dfs["Guardian"]

for s in sample:
    sample_df = pd.merge(sample_df, dfs[s], on=["Artist", "Album"], how="inner")

sample_df.shape[0]

  sample_df = pd.merge(sample_df, dfs[s], on=["Artist", "Album"], how="inner")


801

In [48]:
sample1 = ["Spectrum", "NME1", "Pitchfork", "UTR"]
sample_df1 = dfs["Guardian"]

for s in sample1:
    sample_df1 = pd.merge(sample_df1, dfs[s], on=["Artist", "Album"], how="inner")
    
sample_df1.shape[0]

  sample_df1 = pd.merge(sample_df1, dfs[s], on=["Artist", "Album"], how="inner")


378

In [49]:
sample2 = ["Spectrum", "NME1", "Pitchfork", "Skinny"]
sample_df2 = dfs["Guardian"]

for s in sample2:
    sample_df2 = pd.merge(sample_df2, dfs[s], on=["Artist", "Album"], how="inner")
    
sample_df2.shape[0]

  sample_df2 = pd.merge(sample_df2, dfs[s], on=["Artist", "Album"], how="inner")


359

In [50]:
print(f"UTR: {1 - (378 / 801)}, Skinny: {1 - (359 / 801)}")

UTR: 0.5280898876404494, Skinny: 0.5518102372034956


In [24]:
df = dfs["Pitchfork"]

for key in list(dfs.keys())[1:]:
    
    pf_merged = pd.merge(df, dfs[key], how="inner", on=["Album", "Artist"])
    print(key, df.shape[0])
    print(pf_merged)


NME1 24549
     Platform_x          Artist                              Album Platform_y
0     Pitchfork        the 1975  being funny in a foreign language        NME
1     Pitchfork            tsha                      capricorn sun        NME
2     Pitchfork          daphni                             cherry        NME
3     Pitchfork         shygirl                              nymph        NME
4     Pitchfork          pixies                           doggerel        NME
...         ...             ...                                ...        ...
2051  Pitchfork      kanye west                  late registration        NME
2052  Pitchfork   kings of leon               aha shake heartbreak        NME
2053  Pitchfork  the libertines                     the libertines        NME
2054  Pitchfork            blur                         think tank        NME
2055  Pitchfork     the streets           original pirate material        NME

[2056 rows x 4 columns]
Pitchfork 24549
      Platfo