In [None]:
import pandas as pd
import os
import sys
import hvplot
import hvplot.pandas
from pathlib import Path
from datetime import datetime
import numpy as np
sys.path.append(os.path.relpath(".."))
from adClassifier.utils import make_combined_df, DATA_FOLDER, ImgFromHTMLParser, get_img_from_html
from adClassifier.models import image_classifier, message_classifier

In [None]:
# Data directory: a "data" folder that sits next to the current working directory.
# This rebinds the DATA_FOLDER name imported from adClassifier.utils in the import cell.
DATA_FOLDER = Path.cwd().parent / "data"
print(DATA_FOLDER.absolute())

In [None]:
# Read the ad export from the data folder; pandas infers the gzip compression
# from the ".gz" suffix of the file name.
ads_csv_path = DATA_FOLDER / "en-US.csv.gz"
df = pd.read_csv(ads_csv_path)

In [None]:
# Keep only the rows whose political_probability is strictly above 0.90,
# then show the shape of the filtered frame as the cell output.
is_likely_political = df["political_probability"] > 0.90
df = df[is_likely_political]
df.shape

In [None]:
def _leading_date(timestamp):
    """Parse the text before the first space of ``timestamp`` as a ``YYYY-MM-DD`` date."""
    day_text = timestamp.partition(" ")[0]
    return datetime.strptime(day_text, "%Y-%m-%d")


# Creation date derived from the created_at text column.
df["created_date"] = df["created_at"].apply(_leading_date)
# Rows without an update timestamp fall back to their creation timestamp.
df["updated_at"] = df["updated_at"].fillna(df["created_at"])
df["updated_date"] = df["updated_at"].apply(_leading_date)


In [None]:
# --- Fill missing text / label values ----------------------------------------
# Columns are re-assigned instead of calling ``df[col].fillna(..., inplace=True)``:
# the in-place form acts on a temporary column selection and does not update
# ``df`` when pandas copy-on-write is active.
df["message"] = df["message"].fillna("")
df["political"] = df["political"].fillna(-1)
df["not_political"] = df["not_political"].fillna(-1)
df["title"] = df["title"].fillna("")
df["id"] = df["id"].astype(str)
df["paid_for_by"] = df["paid_for_by"].fillna("")


def _first_image(value):
    """Return the first element of ``value``, or ``""`` when there is none.

    A missing value (``None`` / float NaN) and an empty container both map to
    ``""``. NaN is detected by value rather than with ``is np.NaN``: the identity
    check is fragile and the ``np.NaN`` alias was removed in NumPy 2.0.
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return ""
    return value[0] if len(value) > 0 else ""


def _most_common_image_per_id(frame):
    """Return a frame with columns ``id`` and ``images``: the most frequent known image per id.

    Rows whose image is missing (NaN) or ``""`` are ignored, so the lookup never
    hands back a placeholder as the "most common" image. Ids without any known
    image are simply absent from the result.
    """
    has_image = frame["images"].notna() & (frame["images"] != "")
    return (
        frame.loc[has_image]
        .groupby("id")["images"]
        .agg(lambda images: images.value_counts().index[0])
        .reset_index()
    )


# --- Fill missing images -------------------------------------------------------
# NOTE(review): df is read straight from CSV, so "images" cells are presumably
# strings; indexing a string with [0] yields its first character, not the first
# image. Confirm the column is list-like here or parse it before this step.
df["images"] = df["images"].map(_first_image)
df.loc[df["images"] == "", "images"] = np.nan

# 1) Fill from other rows that share the same id. The lookup is built from known
#    images only; building it while missing rows still held "" would write ""
#    back into those rows and stop the HTML fallback below from ever running.
most_common = _most_common_image_per_id(df)
missing = df["images"].isnull()
df.loc[missing, "images"] = df.loc[missing, "id"].map(most_common.set_index("id")["images"])

# 2) If the image is still unknown, try to extract it from the ad's HTML.
missing = df["images"].isnull()
df.loc[missing, "images"] = df.loc[missing, "html"].apply(get_img_from_html)
# Treat an empty string from the HTML step as "still missing".
# NOTE(review): the return value of get_img_from_html for "no image" is not visible here.
df.loc[df["images"] == "", "images"] = np.nan

# 3) Fill from same-id rows again, with the lookup rebuilt so that images
#    recovered from HTML in step 2 can be shared. Rows that are still unresolved
#    stay NaN.
most_common = _most_common_image_per_id(df)
missing = df["images"].isnull()
df.loc[missing, "images"] = df.loc[missing, "id"].map(most_common.set_index("id")["images"])


In [None]:
# Add an all-NaN float column that will hold the message label.
df["message_label"] = np.full(len(df), np.nan)

### Combine completions and data

In [None]:
from pathlib import Path

# Folder that is searched recursively for completion JSON files.
# The AD_COMPLETIONS_FOLDER environment variable overrides the location so the
# notebook can run on other machines; the original path remains the default.
_DEFAULT_COMPLETIONS_FOLDER = "/Users/tjpajala/PycharmProjects/adClassifier/ad_labeling/completions"
COMPLETIONS_FOLDER = Path(os.environ.get("AD_COMPLETIONS_FOLDER", _DEFAULT_COMPLETIONS_FOLDER))

# Filled by the loading cell below: completion id -> chosen label.
completions = {}

print("Looking in {}".format(COMPLETIONS_FOLDER))
if not COMPLETIONS_FOLDER.is_dir():
    # rglob() on a missing folder yields nothing, so say why the list is empty.
    print("Folder {} does not exist".format(COMPLETIONS_FOLDER))
# Sorted so the file order (and the preview below) is the same on every run.
json_files = sorted(COMPLETIONS_FOLDER.rglob("*.json"))
print("Found {} json files".format(len(json_files)))
print(json_files[0:2])

In [None]:
import json
from tqdm import tqdm
from collections import Counter

# Read every completion file and record the first choice of its first completion,
# keyed by that completion's id (as a string).
for jf in tqdm(json_files):
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, and keep the file open only while it is being parsed.
    with open(jf, "r", encoding="utf-8") as f:
        d = json.load(f)
    first_completion = d["completions"][0]
    c_id = str(first_completion["id"])
    results = first_completion["result"]
    if len(results) == 0:
        # Nothing was chosen for this item.
        print("Skipping {} with empty result".format(jf.name))
        continue
    val = results[0]["value"]["choices"]
    completions[c_id] = val[0]
print("Found total {} completions.".format(len(completions)))
print("Completion distribution: \n{}".format(Counter(completions.values())))

In [None]:
# Eyeball a few ids from each side to check that their formats match.
print(sorted(completions.keys())[0:5])
# NOTE(review): unlike the line above, this sorts only the first five ids of df
# rather than taking the five smallest ids — confirm which one is intended.
print(sorted(df.id.values[0:5]))

# Number of completion ids that also occur in df. The ids are put into a set
# once so each lookup is constant-time instead of a scan over the whole array.
known_ids = set(df.id.values)
sum(1 for x in completions.keys() if x in known_ids)

In [None]:
# Integer label codes.
# NOTE(review): presumably "no label" / Democrat / Republican — confirm against the labeling code.
ABSTAIN, DEM, REP = -1, 0, 1


In [None]:
# Load the "labeling_functions" section of params.yaml (resolved against the
# current working directory) and expose its settings as notebook-level names.
# NOTE(review): `yaml` is not imported in any of the cells shown above — confirm
# that `import yaml` is present in the import cell, otherwise this cell raises NameError.
# The file is opened in a `with` block so the handle is closed after parsing.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['labeling_functions']

# Paths. This rebinds DATA_FOLDER, replacing the value assigned earlier in the notebook.
DATA_FOLDER = Path(params["data_folder"])
METRICS_FOLDER = Path(params["metrics_folder"])
SCORES_FILE = METRICS_FOLDER / "scores.json"
AUC_FILE = METRICS_FOLDER / "auc.json"

# Settings read verbatim from the params file.
seed = params["seed"]
n_epochs = params["n_epochs"]
log_freq = params["log_freq"]
n_estimators = params["n_estimators"]
class_weight = params["class_weight"]

# Output target read from the params file.
OUTPUT_SPREADSHEET = params["output_spreadsheet_id"]
OUTPUT_RANGE = params["output_range"]
