In [42]:
from pyspark.sql import SparkSession
import json

In [43]:
spark = SparkSession.builder.appName("learning-dataframe").getOrCreate()

In [44]:
spark

In [5]:
df = spark.read.option('header',"true").csv("bodylab_search.csv", inferSchema = True)

In [28]:
df.printSchema()

root
 |-- Product_Code: integer (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Search_Term: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- shallow_url: string (nullable = true)
 |-- DetailURL: string (nullable = true)



In [29]:
df.show()

+------------+--------------------+-----------+----+--------------------+--------------------+
|Product_Code|        Product_Name|Search_Term|rank|         shallow_url|           DetailURL|
+------------+--------------------+-----------+----+--------------------+--------------------+
|       24212|Vitamin A-Z (120 ...|  Vitamin-D|   1|https://www.bodyl...|https://www.bodyl...|
|       26332|Vitamin D Tabs (8...|  Vitamin-D|   2|https://www.bodyl...|https://www.bodyl...|
|        7862|Vitamin D3 (180 K...|  Vitamin-D|   3|https://www.bodyl...|https://www.bodyl...|
|       27238|Vitamin B12 (120 ...|  Vitamin-D|   4|https://www.bodyl...|https://www.bodyl...|
|       24208|Vitamin D3 + K2 (...|  Vitamin-D|   5|https://www.bodyl...|https://www.bodyl...|
|       26041|Vitamin D3 + K2 (...|  Vitamin-D|   6|https://www.bodyl...|https://www.bodyl...|
|       24217|Vitamin D3 + K2 (...|  Vitamin-D|   7|https://www.bodyl...|https://www.bodyl...|
|       26627|Vitamin D3-K2 Tro...|  Vitamin-D|   

In [30]:
type(df)

pyspark.sql.dataframe.DataFrame

In [29]:
# selecting individual columns
df_filtered = df.select(["rank","Search_Term"])
list_a = df_filtered.select("Search_Term").distinct().collect()

In [32]:
df.select(["URL","Product_Name"])

AnalysisException: cannot resolve '`URL`' given input columns: [DetailURL, Product_Code, Product_Name, Search_Term, rank, shallow_url];
'Project ['URL, Product_Name#2278]
+- Relation[Product_Code#2277,Product_Name#2278,Search_Term#2279,rank#2280,shallow_url#2281,DetailURL#2282] csv


In [18]:
df.dtypes

[('URL', 'string'),
 ('Product_Name', 'string'),
 ('Sku', 'string'),
 ('Availability', 'int'),
 ('Price', 'double'),
 ('Rating', 'double'),
 ('Review', 'int'),
 ('Image_URL', 'string'),
 ('Old_price', 'string')]

In [24]:
df.describe().show()

+-------+--------------------+--------------------+--------------------+------------+-----------------+------------------+------------------+--------------------+---------+
|summary|                 URL|        Product_Name|                 Sku|Availability|            Price|            Rating|            Review|           Image_URL|Old_price|
+-------+--------------------+--------------------+--------------------+------------+-----------------+------------------+------------------+--------------------+---------+
|  count|                  14|                  14|                  14|          14|               14|                14|                14|                  14|        4|
|   mean|                null|                null|1.420407358403333...|         1.0|41.56285714285714| 3.692857142857143|35.785714285714285|                null|     null|
| stddev|                null|                null|3.370405653534945E11|         0.0| 19.1479380004827|2.0216003887682974| 58.925218019

In [42]:
# check rank order (1,2,3,4.. n) for each search term
df.select("Search_Term").distinct().collect()

[Row(Search_Term='Riegel mit hohem Proteingehalt'),
 Row(Search_Term='Gewichtsgewinner'),
 Row(Search_Term='Kreatin'),
 Row(Search_Term='Riegel+mit+hohem+Proteingehalt'),
 Row(Search_Term='Massengewinner'),
 Row(Search_Term='Protein+Pulver'),
 Row(Search_Term='Gewinner'),
 Row(Search_Term='Protein-Bodybuilding'),
 Row(Search_Term='isoliertes+Eiweiß'),
 Row(Search_Term='Proteinriegel'),
 Row(Search_Term='vor dem Training'),
 Row(Search_Term='Aminos&auml;uren'),
 Row(Search_Term='Aminosäuren'),
 Row(Search_Term='Aminos'),
 Row(Search_Term='Riegel'),
 Row(Search_Term='Eiwei'),
 Row(Search_Term='Bände'),
 Row(Search_Term='zus&auml;tzliches Eiwei&szlig;'),
 Row(Search_Term='vor+dem+Training'),
 Row(Search_Term='Eiwei&szlig;shaker'),
 Row(Search_Term='Proteine'),
 Row(Search_Term='isoliertes+Molkenprotein'),
 Row(Search_Term='isoliertes Molkenprotein'),
 Row(Search_Term='isoliertes Eiwei&szlig;'),
 Row(Search_Term='essentielle Aminos&auml;uren'),
 Row(Search_Term='Eiweißshaker'),
 Row(Search

In [56]:
# rank information incre
df_filtered = df.select(["rank","Search_Term"])
for term in list_a:
    term = term["Search_Term"]
    infered_sum = int(df_filtered.where(df_filtered["Search_Term"] == term).select("rank").groupBy().sum().collect()[0][0])
    n = n = int(df_filtered.where(df_filtered["Search_Term"] == term).count())
    GT_sum = (n)*(n+1)/2
    if infered_sum == int(GT_sum):
        print(f"{term}    :    ordered")
    else:
        print(f"{term}    :    Not ordered") 

Riegel mit hohem Proteingehalt    :    Not ordered
Gewichtsgewinner    :    ordered
Kreatin    :    ordered
Riegel+mit+hohem+Proteingehalt    :    ordered
Massengewinner    :    ordered
Protein+Pulver    :    ordered
Gewinner    :    ordered
Protein-Bodybuilding    :    ordered
isoliertes+Eiweiß    :    ordered
Proteinriegel    :    ordered
vor dem Training    :    Not ordered
Aminos&auml;uren    :    Not ordered
Aminosäuren    :    ordered
Aminos    :    Not ordered
Riegel    :    Not ordered
Eiwei    :    Not ordered
Bände    :    ordered
zus&auml;tzliches Eiwei&szlig;    :    Not ordered
vor+dem+Training    :    ordered
Eiwei&szlig;shaker    :    Not ordered
Proteine    :    ordered
isoliertes+Molkenprotein    :    ordered
isoliertes Molkenprotein    :    Not ordered
isoliertes Eiwei&szlig;    :    Not ordered
essentielle Aminos&auml;uren    :    Not ordered
Eiweißshaker    :    ordered
Protein    :    Not ordered
Protein Pulver    :    Not ordered
Molkenprotein    :    ordered
Vita

In [77]:
#df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").show()
df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").count()

1728

In [131]:
df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").select("rank").collect()[0]

Row(rank=1)

In [154]:
df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").select("rank").groupby().sum().collect()[0][0]

1493856

In [138]:
df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").select("rank").count()

1728

In [104]:
[int(s) for s in "Row(sum(rank)=2498730)".split() if s.isdigit()]

[]

In [136]:
type(infered_sum),n, type(n)

(int, 57136, int)

In [145]:
(1728 * 1729)/2, list_a["Vitamin-D"]

TypeError: list indices must be integers or slices, not str

In [157]:
term = "Vitamin-D"

infered_sum = int(df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").select("rank").groupBy().sum().collect()[0][0])
n = int(df_filtered.where(df_filtered["Search_Term"] == "Vitamin-D").count())
GT_sum = (n)*(n+1)/2
print(infered_sum,GT_sum,n)
if infered_sum == int(GT_sum):
    print(f"{term}    :    ordered")
else:
    print(f"{term}    :    Not ordered") 

1493856 1493856.0 1728
Vitamin-D    :    ordered


In [3]:
df = spark.read.option('header',"true").csv("bodylab_detail.csv", inferSchema = True)

In [7]:
df.count()

14

In [26]:
from pyspark.sql.functions import col,isnan,when,count
columns = list()
for cols in df.dtypes:
    columns.append(cols[0])
    
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in columns]
   ).show()

+---+------------+---+------------+-----+------+------+---------+---------+
|URL|Product_Name|Sku|Availability|Price|Rating|Review|Image_URL|Old_price|
+---+------------+---+------------+-----+------+------+---------+---------+
|  0|           0|  0|           0|    0|     0|     0|        0|       10|
+---+------------+---+------------+-----+------+------+---------+---------+



In [15]:
columns = list()
for col in df.dtypes:
    columns.append(col[0])
columns

['URL',
 'Product_Name',
 'Sku',
 'Availability',
 'Price',
 'Rating',
 'Review',
 'Image_URL',
 'Old_price']

In [25]:
# vitamin D --> 1,2,3,4,5  distinct == len of vitamin D col --> ordered
# vitamin D --> 1,3,4,5,6 dustinct == len of vitamin D col --> but unordered

TypeError: 'Column' object is not callable

In [34]:
for term in list_a:
    term = term["Search_Term"]
    length = df_filtered.where(df_filtered["Search_Term"] == term).count()
    rank_count = df_filtered.where(df_filtered["Search_Term"] == term).select("rank").distinct().count()
    print(f" {term}  --- {length} ---- {rank_count}")

 Riegel mit hohem Proteingehalt  --- 48 ---- 48
 Gewichtsgewinner  --- 32 ---- 32
 Kreatin  --- 5197 ---- 5197
 Riegel+mit+hohem+Proteingehalt  --- 48 ---- 48
 Massengewinner  --- 27 ---- 27
 Protein+Pulver  --- 48 ---- 48
 Gewinner  --- 9 ---- 9
 Protein-Bodybuilding  --- 3120 ---- 3120
 isoliertes+Eiweiß  --- 48 ---- 48
 Proteinriegel  --- 1416 ---- 1416
 vor dem Training  --- 6 ---- 6
 Aminos&auml;uren  --- 48 ---- 48
 Aminosäuren  --- 48 ---- 48
 Aminos  --- 2787 ---- 2787
 Riegel  --- 1882 ---- 1882
 Eiwei  --- 4277 ---- 4277
 Bände  --- 12 ---- 12
 zus&auml;tzliches Eiwei&szlig;  --- 15 ---- 15
 vor+dem+Training  --- 48 ---- 48
 Eiwei&szlig;shaker  --- 43 ---- 43
 Proteine  --- 3128 ---- 3128
 isoliertes+Molkenprotein  --- 48 ---- 48
 isoliertes Molkenprotein  --- 40 ---- 40
 isoliertes Eiwei&szlig;  --- 40 ---- 40
 essentielle Aminos&auml;uren  --- 48 ---- 48
 Eiweißshaker  --- 48 ---- 48
 Protein  --- 11564 ---- 6038
 Protein Pulver  --- 48 ---- 48
 Molkenprotein  --- 3333 ----

In [36]:
term = "Riegel mit hohem Proteingehalt"

infered_sum = int(df_filtered.where(df_filtered["Search_Term"] == term).select("rank").groupBy().sum().collect()[0][0])
n = int(df_filtered.where(df_filtered["Search_Term"] == term).count())
GT_sum = (n)*(n+1)/2
print(infered_sum,GT_sum,n)
if infered_sum == int(GT_sum):
    print(f"{term}    :    ordered")
else:
    print(f"{term}    :    Not ordered") 

3480 1176.0 48
Riegel mit hohem Proteingehalt    :    Not ordered


In [39]:
df_filtered.where(df_filtered["Search_Term"] == term).select("rank").show()

+----+
|rank|
+----+
|  49|
|  50|
|  51|
|  52|
|  53|
|  54|
|  55|
|  56|
|  57|
|  58|
|  59|
|  60|
|  61|
|  62|
|  63|
|  64|
|  65|
|  66|
|  67|
|  68|
+----+
only showing top 20 rows



In [45]:
# cols, file_names as variable
# input json config file, location for csv files, url's --> output json with results 


from pyspark.sql.functions import col,isnan,when,count

def input_csv_file(file_name:str):
    df = spark.read.option('header',"true").csv(file_name, inferSchema = True)
    return df


def input_txt_file(file:str):
    input_list = spark.read.text(file).collect()
    #file2 = spark.read.text(detail_file).collect()
    
    return input_list


def missing_attribute_test(df:str, columns:list):
    
    #columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"]
    
    return {col:("Missing" if df.filter(df[col].isNull()).count()>0 else "No Missing") for col in columns}


# def missing_attribute_test_detail(df:str, columns:list):
    
#     #columns = ["URL","Product_Name", "Sku", "Availability","Price","Rating","Review","ImageURL","Old_price"]
    
#     return {col:df.filter(df[col].isNull()).count() for col in columns}

    

def non_empty_results_for_each_search_term_test(df:str):
    
    result = dict()
    term_list = df.select("Search_Term").distinct().collect()
    for col in term_list:
        col = col["Search_Term"]
        if int(df.where(df["Search_Term"] == col).count()) == 0:
            result[col] = "Empty"
        else:
            result[col] = "Non-Empty"

    return result


def search_terms_not_in_csv_test(df:str, search_list:list):
# file contains the search terms
# need to match the search terms in the file with the search terms in the csv
    result = dict()
    for search_term in search_list:
        search_term_from_list = search_term["value"]
        
        # find this search term in csv
# df_filtered.where(df_filtered["Search_Term"] == term)

        if df.where(df["Search_Term"] == search_term_from_list):
            result[search_term_from_list] = "Present"
        else:
            result[search_term_from_list] = "Absent"
    
    return result

# are all the products listed included in the scrape?
def products_not_listed_in_scrape_test(df:str, url_list:list):
    
    result = dict()
    for url in url_list:
        url_from_list = url["value"]
        
        if df.where(df["URL"] == url_from_list):
            result[url_from_list] = "Present"
        else:
            result[url_from_list] = "Absent"
            
    return result

def rank_ordering_test(df:str):
    
    result = dict()
    term_list = df.select("Search_Term").distinct().collect()
    for term in term_list:
        term = term["Search_Term"]
        infered_sum = int(df.where(df["Search_Term"] == term).select("rank").groupBy().sum().collect()[0][0])
        n = int(df.where(df["Search_Term"] == term).count())
        GT_sum = (n)*(n+1)/2
        
        if infered_sum == int(GT_sum):
            result[term] = "ordered" 
        else:
            result[term] = "un-ordered" 
    
    return result


In [26]:
#missing_attribute_test_search(df),results_for_each_search_term_test(df),rank_ordering_test(df)
df = input_csv_file(file_name = "bodylab_search.csv")
search_list = input_txt_file(file = "bodylab24_search.txt")
url_list = input_txt_file(file = "bodylab24_detail.txt")
a=missing_attribute_test(df,columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"])
b=non_empty_results_for_each_search_term_test(df)
c = search_terms_not_in_csv_test(df, search_list)
d = rank_ordering_test(df)

#e = products_not_listed_in_scrape_test(df, url_list)

In [28]:
# result = dict()
# for search_term in search_list:
#     search_term_from_list = search_term["value"]

#     # find this search term in csv
#     # df_filtered.where(df_filtered["Search_Term"] == term)

#     if df.where[df["Search_Term"] == search_term_from_list]:
#         result[search_term] = "Present"
#     else:
#         result[search_term] = "Absent"
#missing_attribute_test(df,columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"])
type(a)

dict

In [29]:
df = input_csv_file(file_name = "bodylab_detail.csv")
#search_list = input_txt_file(file = "bodylab24_search.txt")
url_list = input_txt_file(file = "bodylab24_detail.txt")
#a=missing_attribute_test(df,columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"])
#b=non_empty_results_for_each_search_term_test(df)
#c = search_terms_not_in_csv_test(df, search_list)
#d = rank_ordering_test(df)
e = products_not_listed_in_scrape_test(df, url_list)

In [30]:
e

{'https://www.bodylab24.de/100-whey-gold-standard-908g.html': 'Present',
 'https://www.bodylab24.de/100-whey-gold-standard-450g.html': 'Present',
 'https://www.bodylab24.de/serious-mass-5450g.html': 'Present',
 'https://www.bodylab24.de/amino-x-aminosaeuren-bcaa-1000g.html': 'Present',
 'https://www.bodylab24.de/amino-x-aminosaeuren-bcaa-435g.html': 'Present',
 'https://www.bodylab24.de/syntha-6-original-2260g.html': 'Present',
 'https://www.bodylab24.de/100-casein-gold-standard-1820g.html': 'Present',
 'https://www.bodylab24.de/gold-standard-pre-work-out-330g.html': 'Present',
 'https://www.bodylab24.de/n-o-xploder-650g.html': 'Present',
 'https://www.bodylab24.de/protein-superfood-chocolate-peanutbutter-430g.html': 'Present',
 'https://www.bodylab24.de/protein-crisp-bar-10x65g.html': 'Present',
 'https://www.bodylab24.de/gold-standard-bcaa-train-sustain-266g.html': 'Present',
 'https://www.bodylab24.de/protein-bar-10x60g.html': 'Present',
 'https://www.bodylab24.de/100-whey-gold-stan

In [46]:
# input_json = """{
#   "search_csv": "bodylab_search.csv",
#   "detail_csv": "bodylab_detail.csv",
#   "search_txt": "bodylab24_search.txt",
#   "detail_txt": "bodylab24_detail.txt",
#   "tests":{
#     "missing_attribute_test": {
#       "status": 1,
#       "on_file": "search csv",
#       "columns": ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"]
#     },
#     "non_empty_results_for_each_search_term_test":{
#       "status": 1,
#       "on_file": "search csv"
#     },
#     "search_terms_not_in_csv_test":{
#       "status": 1,
#       "on_file": "search csv"
#     },
#     "rank_ordering_test":{
#       "status": 1,
#       "on_file": "search csv"
#     }
#   }
# }"""

# with open("input.json") as f:
#     json_data = json.load(f)
# print(json_data)
# search_csv = json_data["search_csv"]
# detail_csv = json_data["detail_csv"]
# search_txt_file = json_data["search_txt"]
# detail_txt_file = json_data["detail_txt"]

# for test in json_data["tests"]:
#     if test == "missing_attribute_test"

def json_extract(obj, key):
    """Recursively fetch values from nested JSON."""
    arr = []

    def extract(obj, arr, key):
        """Recursively search for values of key in JSON tree."""
        if isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, (dict, list)):
                    extract(v, arr, key)
                elif k == key:
                    arr.append(v)
        elif isinstance(obj, list):
            for item in obj:
                extract(item, arr, key)
        return arr

    values = extract(obj, arr, key)
    return values



In [60]:
#TODO:1) change input json singature to include csv_type before the type of test - Done
# 2) organize the driver_fn to run tests with minimum if/else statements - Done
# 3) present output json is verbose, add a succint version of result (ex: which columns are missing etc.)
# 4) break the concerns into smaller functions (example: make output_json function to output json)
# 5) inside main_Driver fn incldue : detail driver fn and search driver fn - Done
# 6) replace if/else with dictionary: dict.get(x, default) -  Done
# 7) use try, except, else, finally for handling file and json exceptions
# 8) export jupter ntbk script to vscode and push
# 9) run for detail csv
# 10) separate concerns in run_fns 

run_logic = {
    "missing_attribute_test":missing_attribute_test,
    "non_empty_results_for_each_search_term_test":non_empty_results_for_each_search_term_test,
    "search_terms_not_in_csv_test":search_terms_not_in_csv_test,
    "rank_ordering_test":rank_ordering_test,
    "products_not_listed_in_scrape_test":products_not_listed_in_scrape_test,
    "products_not_listed_in_scrape_test":products_not_listed_in_scrape_test
}



def run_fns(file):
    
    run_id = json_extract(file, "run_id")[0]
    search_csv = json_extract(file, "search_csv")[0]
    detail_csv = json_extract(file, "detail_csv")[0]
    search_txt = json_extract(file, "search_txt")[0]
    detail_txt = json_extract(file, "detail_txt")[0]
    
#     if json_data["tests"]["missing_attribute_test"]["on_file"] == "search_csv":
#         search_columns = json_data["tests"]["missing_attribute_test"]["columns"]
        
#     elif json_data["tests"]["missing_attribute_test"]["on_file"] == "detail_csv":
#         detail_columns = json_data["tests"]["missing_attribute_test"]["columns"]
        
    for search_test in file["tests"]["search_test"]:
        
        search_df = input_csv_file(search_csv)
        search_list = input_txt_file(search_txt)
        
        action = run_logic.get(search_test)
        print(action)
        
        if action == missing_attribute_test and file["tests"]["search_test"]["missing_attribute_test"]["status"] == 1:
            cols = file["tests"]["search_test"]["missing_attribute_test"]["columns"]
            search_missing_attribute = action(df = search_df, columns = cols)
        
        if action == non_empty_results_for_each_search_term_test and file["tests"]["search_test"]["non_empty_results_for_each_search_term_test"]["status"] == 1:
            search_non_empty_attributes = action(df = search_df)
            
        if action == search_terms_not_in_csv_test and file["tests"]["search_test"]["search_terms_not_in_csv_test"]["status"] == 1:
            search_terms_not_in_csv = action(df = search_df, search_list = search_list)
    
        if action == rank_ordering_test and file["tests"]["search_test"]["rank_ordering_test"]["status"] == 1:
            search_rank_order = action(df = search_df)
    
    for detail_test in file["tests"]["detail_test"]:
        
        detail_df = input_csv_file(detail_csv)
        url_list = input_txt_file(detail_txt)
        
        action = run_logic.get(detail_test)
        print(action)
        
        if action == missing_attribute_test and file["tests"]["detail_test"]["missing_attribute_test"]["status"] == 1:
            cols_2 = file["tests"]["detail_test"]["missing_attribute_test"]["columns"]
            detail_missing_attribute = action(df = detail_df, columns = cols_2)
        
        if action == products_not_listed_in_scrape_test and file["tests"]["detail_test"]["products_not_listed_in_scrape_test"]["status"] == 1:
            detail_products_not_listed_in_scrape = action(df = detail_df, url_list = url_list)
            

#     if json_extract(json_data, "detail_txt")[0] == "search_csv":
#     search_df = input_csv_file(file_name = search_csv)
#     search_list = input_txt_file(file = search_txt)

#     search_missing_attribute = missing_attribute_test(search_df,columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"])

#     search_non_empty_attributes = non_empty_results_for_each_search_term_test(search_df)

#     search_terms_not_in_csv = search_terms_not_in_csv_test(search_df, search_list)

#     search_rank_order = rank_ordering_test(search_df)

    output = {"run_id": f"{run_id}",
        "search_csv": f"{search_csv}",
        "detail_csv": f"{detail_csv}",
        "search_txt": f"{search_txt}",
        "detail_txt": f"{detail_txt}",
        "search_missing_attributes": search_missing_attribute,
        "search_non-empty_attributes":search_non_empty_attributes,
        "search_terms_not_in_csv":search_terms_not_in_csv,
        "search_rank_order":search_rank_order,
        "detail_missing_attribute": detail_missing_attribute,
        "detail_products_not_listed_in_scrape": detail_products_not_listed_in_scrape
    }
    
    return output
# df = input_csv_file(file_name = "bodylab_detail.csv")
#     search_list = input_txt_file(file = "bodylab24_search.txt")
#     url_list = input_txt_file(file = "bodylab24_detail.txt")
#     a=missing_attribute_test(df,columns = ["Product_Code","Product_Name", "Search_Term", "rank","shallow_url","DetailURL"])
#     b=non_empty_results_for_each_search_term_test(df)
#     c = search_terms_not_in_csv_test(df, search_list)
#     d = rank_ordering_test(df)

In [62]:
with open("input2.json") as f:
    json_data = json.load(f)
  
output = []
def run_driver_fn(json_data):
    for file in json_data["runs"]:
        output.append(run_fns(file))
        
run_driver_fn(json_data)

run_index = json_data["run_index"]

output = """
run_index = {0},
runs:{1}
""".format(run_index, output)

<function missing_attribute_test at 0x7f18d7a263a0>
<function non_empty_results_for_each_search_term_test at 0x7f18d7c4d040>
<function search_terms_not_in_csv_test at 0x7f18d7c4d160>
<function rank_ordering_test at 0x7f18d7c4dca0>
<function missing_attribute_test at 0x7f18d7a263a0>
<function products_not_listed_in_scrape_test at 0x7f18d7c4d280>
<function missing_attribute_test at 0x7f18d7a263a0>
<function non_empty_results_for_each_search_term_test at 0x7f18d7c4d040>
<function search_terms_not_in_csv_test at 0x7f18d7c4d160>
<function rank_ordering_test at 0x7f18d7c4dca0>
<function missing_attribute_test at 0x7f18d7a263a0>
<function products_not_listed_in_scrape_test at 0x7f18d7c4d280>


In [63]:
output = """run_index = {0},
runs:{1}
""".format(run_index, output)

In [64]:
with open("output.json", "w") as outfile:
    json.dump(output, outfile, indent = 4)

In [41]:
output

"run_index = 1,\nruns:\nrun_index = 1,\nruns:[{'run_id': 'msje', 'search_csv': 'bodylab_search.csv', 'detail_csv': 'bodylab_detail.csv', 'search_txt': 'bodylab24_search.txt', 'detail_txt': 'bodylab24_detail.txt', 'missing_attributes': {'Product_Code': 'No Missing', 'Product_Name': 'No Missing', 'Search_Term': 'No Missing', 'rank': 'No Missing', 'shallow_url': 'No Missing', 'DetailURL': 'No Missing'}, 'non-empty_attributes': {'Riegel mit hohem Proteingehalt': 'Non-Empty', 'Gewichtsgewinner': 'Non-Empty', 'Kreatin': 'Non-Empty', 'Riegel+mit+hohem+Proteingehalt': 'Non-Empty', 'Massengewinner': 'Non-Empty', 'Protein+Pulver': 'Non-Empty', 'Gewinner': 'Non-Empty', 'Protein-Bodybuilding': 'Non-Empty', 'isoliertes+Eiweiß': 'Non-Empty', 'Proteinriegel': 'Non-Empty', 'vor dem Training': 'Non-Empty', 'Aminos&auml;uren': 'Non-Empty', 'Aminosäuren': 'Non-Empty', 'Aminos': 'Non-Empty', 'Riegel': 'Non-Empty', 'Eiwei': 'Non-Empty', 'Bände': 'Non-Empty', 'zus&auml;tzliches Eiwei&szlig;': 'Non-Empty', '