In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


#!/usr/bin/env python
# coding: utf-8

# --- NOTES -------------------------------------------------------------------
# 1. Update the datasets, dataList
# -----------------------------------------------------------------------------

import os
import re
import sys
import json
import time
import pyspark
from ast import literal_eval
from copy import deepcopy
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.functions import udf, unix_timestamp, col ,length
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, FloatType, DateType, TimestampType
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline 
from collections import Counter
import pyspark.sql.functions as f

#import spacy
#from spacy import displacy
#import en_core_web_sm


# -----------------------------------------------------------------------------
# --- Function Definitions Begin ----------------------------------------------

# Function to find mean and stdv for all files
def mean_stdv(df):
    unlist = udf(lambda x: round(float(list(x)[0]),3), DoubleType())
    for i in ["count"]:
        assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")
        scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        df = pipeline.fit(df).transform(df).withColumn(i+"_Scaled", unlist(i+"_Scaled")).drop(i+"_Vect")
        df_stats = df.select(_mean(col('count_Scaled')).alias('mean'),_stddev(col('count_Scaled')).alias('std')).collect()
        mean = df_stats[0]['mean']
        std = df_stats[0]['std']
        return df_stats 

# Function to sum all count of values for all files
def count_all_values(df):
    res = df.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
    return res

# Regex function to check website type
def re_find_website(df,count_all,found_type):
    web_re_rexpr = "WWW\.|\.COM|HTTP\:"
    df_filtered = df.filter(df["val"].rlike(web_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["website"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function to check zip type
def re_find_zipCode(df,count_all,found_type):
    zip_re_rexpr = "^\d{5}?$|^\d{5}?-\d\d\d$|^\d{8}?$"
    df_filtered = df.filter(df["val"].rlike(zip_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["zip_code"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function to check buildingCode type
def re_find_buildingCode(df,count_all,found_type):
    bc_re_rexpr = "([A-Z])\d\-"
    df_filtered = df.filter(df["val"].rlike(bc_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["building_classification"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0 

# Regex function to check phone number type
def re_find_phoneNum(df,count_all,found_type):
    phone_re_rexpr = "^\d{10}?$|^\(\d\d\d\)\d\d\d\d\d\d\d$|^\d\d\d\-\d\d\d\-\d\d\d\d$"
    df_filtered = df.filter(df["val"].rlike(phone_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["phone_number"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function to check lat_lon type
def re_find_lat_lon(df,count_all,found_type):
    ll_re_rexpr = "\([-+]?[0-9]+\.[0-9]+\,\s*[-+]?[0-9]+\.[0-9]+\)"
    df_filtered = df.filter(df["val"].rlike(ll_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["lat_lon_cord"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function to check street_addrees type
def re_find_street_address(df,count_all,col_length,found_type):
    st_re_rexpr = "\sROAD|\sSTREET|\sPLACE|\sDRIVE|\sBLVD|\sST|\sRD|\sDR|\sAVENUE|\sAVE"
    df_filtered = df.filter(df["val"].rlike(st_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.8): 
            if (col_length >= 15):
                found_type = found_type + ["address"]
            elif (col_length < 15):
                found_type = found_type + ["street"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function to check school name type
def re_find_school(df,count_all,found_type):
    school_re_rexpr = "\sSCHOOL|\sACADEMY|HS\s|ACAD|I.S.\s|IS\s|M.S.\s|P.S\s|PS\s|ACADEMY\s"
    df_filtered = df.filter(df["val"].rlike(school_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.5): 
            found_type = found_type + ["school_name"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function for checking house number 
def re_find_houseNo(df,count_all,found_type):
    houseNo_re_rexpr = "^\d{2}?$|^\d{3}?$|^\d{4}?$"
    df_filtered = df.filter(df["val"].rlike(houseNo_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["house number"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function for checking school subject
def re_find_school_subject(df,count_all,found_type):
    school_subj_re_rexpr = "^ENGLISH$|^ENGLISH\s[0-9]?$|^MATH\s[A-Z$]|^MATH$|^SCIENCE$|^SOCIAL\sSTUDIES$|^ALGEBRA\s[A-Z]$|                            ^CHEMISTRY$|^ASSUMED\sTEAM\sTEACHING$|^EARTH\sSCIENCE$|^GEOMETRY$|^ECONOMICS$|^GLOBAL HISTORY$|                            ^GLOBAL\sHISTORY[A-Z]$|^LIVING ENVIRONMENT$|^PHYSICS$|^US\sGOVERNMENT$|^US\sGOVERNMENT$|^US\sGOVERNMENT\s&|                            ^US\SHISTORY$|^GLOBAL HISTORY\s[0-9]?$"
    df_filtered = df.filter(df["val"].rlike(school_subj_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.5): 
            found_type = found_type + ["school subject"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

# Regex function for checking school level 
def re_find_schoolLevel(df,count_all,found_type):
    schlvl_re_rexpr = "^[K]\-\d?$|^HIGH SCHOOL$|^ELEMENTARY$|^ELEMENTARY SCHOOL$|^MIDDLE SCHOOL$|^TRANSFER\sSCHOOL$|^MIDDLE$|^HIGH\sSCHOOL\sTRANSFERL$|^YABC$|^[K]\-[0-9]{2}$"
    df_filtered = df.filter(df["val"].rlike(schlvl_re_rexpr))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        if (res >= 0.85): 
            found_type = found_type + ["school level"]
        return res, found_type, count_filtered
    else:
        return 0, found_type, 0

# --- Functions FOR NLP Starts HERE -------------------------------------------
def nlp_find_person(df,count_all,found_type):
    #Your Code HERE: 
    #Use count_all for percentage calculation
    #Please return two values: (1)percentage of such type in this col AND (2)the type found for this column
    #if found:
#         found_type = found_type + ["person"]
    #if not found:
    return 0, found_type, 0

def nlp_find_business_name(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_vehicle_type(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_color(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_car_make(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_car_model(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_neighborhood(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_borough(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

def nlp_find_city(df,count_all,found_type):
    #Your Code HERE:
    return 0, found_type, 0

# --- Function FOR NLP End ------------------------------------------------

# --- Functions FOR LIST COMPARISON Starts HERE -------------------------------
def list_find_school_subject(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(ss_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.4): 
            found_type = found_type + ["school subject"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

def list_find_business_name(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(biz_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.1): 
            found_type = found_type + ["business"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0


def list_find_neighborhood(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(nh_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.1): 
            found_type = found_type + ["neighborhood"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

def list_find_area_of_study(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(aos_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.3): 
            found_type = found_type + ["area of study"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0
    

def list_find_agency(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(ca_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.1): 
            found_type = found_type + ["city agency"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

def list_find_location_type(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(lt_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.1): 
            found_type = found_type + ["location type"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

def list_find_parks_playgrounds(df,count_all,found_type):
    df_filtered = df.filter(df["val"].isin(pp_keywords))
    if (df_filtered.count() is not 0):
        count_filtered = df_filtered.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x + y).collect()[0][1]
        res = float(count_filtered/count_all)
        print(res)
        if (res >= 0.1): 
            found_type = found_type + ["parks and playgrounds"]
        return res, found_type, count_filtered 
    else:
        return 0, found_type, 0

def import_keyword_list(inputDir):
    klist = sc.textFile(inputDir)
    klist = klist.flatMap(lambda x: x.split(",")).collect()
    klist = [x.strip('"') for x in klist]
    klist = [re.sub("\[|\]|\'|\'|" "", "", item)for item in klist]
    klist = [re.sub(" " "", "", item)for item in klist]
    return(klist)

def read_regex_file(inputFile):
    with open(inputFile) as f:
        return(f.read())
    
def get_regex_from_list(lst):
    regex = ""
    for word in lst:
        regex += "\\s"
        regex += word
        regex += "|"
    return(regex)



# --- Function Definitions End ------------------------------------------------
# -----------------------------------------------------------------------------


# -----------------------------------------------------------------------------
# --- MAIN --------------------------------------------------------------------

if __name__ == "__main__":
    # Setting spark context and 
    sc = SparkContext()
    spark = SparkSession \
            .builder \
            .appName("project_task2") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
    sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)


    # Current user path
    env_var = os.environ
    this_user = env_var['USER']

    # Input & output directories
    #inputDirectory = "/user/hm74/NYCColumns/"#sys.argv[1]
    #outputDirectory = "/user/" + this_user + "/project/task2/"#sys.argv[2]
    inputDirectory = "/home/ted/school/big_data/project/big_data_course_project/task2/raw_data/"
    inputFileClusters = "/home/ted/school/big_data/project/big_data_course_project/task2/resources/filename_clusters.json"
    
    input_pp_keywords = "park_playground_keywords"
    input_aos_keywords = "area_of_study_keywords"
    input_ca_keywords = "city_agency_keywords"
    input_ss_keywords = "school_subject_keywords"
    input_sn_keywords = "school_name_keywords"
    input_lt_keywords = "location_type_keywords"
    input_nh_keywords = "neighborhood_keywords"
    input_biz_keywords = "business_keywords"
    input_biz_keywords = "business_keywords"
    
    pp_keywords = import_keyword_list(input_pp_keywords) # parks & playgrounds
    aos_keywords = import_keyword_list(input_aos_keywords) # area of study
    ca_keywords = import_keyword_list(input_ca_keywords) # city agency
    ss_keywords = import_keyword_list(input_ss_keywords) # school subject
    sn_keywords = import_keyword_list(input_sn_keywords) # school name
    lt_keywords = import_keyword_list(input_lt_keywords) # location type
    nh_keywords = import_keyword_list(input_nh_keywords) # neighborhood
    biz_keywords = import_keyword_list(input_biz_keywords) # business name 
    
    #pp_regex = get_regex_from_list(pp_keywords)
    #aos_regex = get_regex_from_list(aos_keywords)
    #ca_regex = get_regex_from_list(ca_keywords)
    #ss_regex = get_regex_from_list(ss_keywords)
    #sn_regex = get_regex_from_list(sn_keywords)
    #lt_regex = get_regex_from_list(lt_keywords)
    #nh_regex = get_regex_from_list(nh_keywords)
    #biz_regex = get_regex_from_list(biz_keywords)
    
    # Output JSON Semantic Schema
    jsonSchema = {
        "column_name": "",
        "semantic_type": [],
        "count": 0
    }

    # Inner semantic schema 
    semanticSchema = {
        "semantic_type": "",
        "label": "",
        "count": 0 
    }

    # Importing cluster3 format it and put it into a list
    #raw_data = sc.textFile("/user/aj2885/Project_Resource/cluster3_labels.tsv")
    raw_data = sc.textFile("true_labels.tsv")
    raw_list = raw_data.map(lambda x: x.split("\t")).collect()

    # Iteration over dataframes begins bu using dataframe file names
    processCount = 1

    # Create schema for raw data before reading into df 
    customSchema = StructType([
                StructField("val", StringType(), True),
                StructField("count", IntegerType(), True)])


#Testing first 10 files
for filerow in raw_list:
    filename = filerow[0]
    #filename = 'qcdj-rwhu.BUSINESS_NAME2.txt.gz'
    if filename == 'bty7-2jhb.Owner_s_House_Zip_Code.txt.gz':
        continue
    labels = literal_eval(filerow[1])
    print("Processing Dataset =========== : ", str(processCount) + ' - ' +filename)
    # Read file to dataset and apply all regex functions
    found_type = []
    fileinfo = []
    regex_res = []
    df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    df_stats = mean_stdv(df)
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    count_all = count_all_values(df)

    #added col_length which is the average length of the col
    df_length = df.select(_mean(length(col("val"))).alias('avg_length'))
    col_length= df_length.collect()[0][0]

    percentage_website, found_type, type_count_web = re_find_website(df,count_all,found_type)
    percentage_zip, found_type, type_count_zip = re_find_zipCode(df,count_all,found_type)
    percentage_buildingCode, found_type,type_count_building = re_find_buildingCode(df,count_all,found_type)
    percentage_phoneNum, found_type, type_count_phone = re_find_phoneNum(df,count_all,found_type)
    percentage_lat_lon, found_type, type_count_lat_lon = re_find_lat_lon(df,count_all,found_type)
    percentage_add_st, found_type, type_count_add_st = re_find_street_address(df,count_all,col_length,found_type)
    percentage_school_name, found_type, type_count_school_name= re_find_school(df,count_all,found_type)
    percentage_house_no, found_type ,type_count_house_no= re_find_houseNo(df,count_all,found_type)
    percentage_school_lvl, found_type, type_count_school_lvl= re_find_schoolLevel(df,count_all,found_type)
    percentage_school_subject, found_type, type_count_school_subject= re_find_school_subject(df,count_all,found_type)
    
    # moved this block up here -ted
    percentage_area_of_study, found_type, type_count_area_of_study = list_find_area_of_study(df,count_all,found_type)
    percentage_school_subject, found_type, type_count_school_subject= list_find_school_subject(df,count_all,found_type)
    percentage_agency, found_type, type_count_agency= list_find_agency(df,count_all,found_type)
    percentage_location, found_type, type_count_location= list_find_location_type(df,count_all,found_type)
    percentage_neighborhood, found_type, type_count_neighborhood= list_find_neighborhood(df,count_all,found_type)
    percentage_parks_playgrounds, found_type, type_count_parks_playgrounds = list_find_parks_playgrounds(df,count_all,found_type)
    percentage_business_name, found_type, type_count_business= list_find_business_name(df,count_all,found_type)

    
    type_count = type_count_web + type_count_zip + type_count_building + \
            type_count_phone + type_count_lat_lon + type_count_add_st + \
            type_count_school_name + type_count_house_no + \
            type_count_school_lvl + type_count_school_subject + \
            type_count_area_of_study + type_count_neighborhood + \
            type_count_agency + type_count_location + \
            type_count_parks_playgrounds + type_count_business
    
    #give a default value for all other precentages 
    percentage_person = 0
    #percentage_business_name = 0
    percentage_vehicle_type = 0
    percentage_color = 0
    percentage_car_make = 0
    percentage_car_model = 0
    #percentage_neighborhood = 0
    percentage_borough= 0 
    percentage_city = 0
    #percentage_area_of_study = 0
    #percentage_location = 0
    #percentage_agency = 0
    #percentage_parks_playgrounds = 0

    #STEP TWO: NLP LABEL AND LIST CHECK
    # if not found_type:
    #     #ANKUSH PART: NLP CHECK TYPES
    #     percentage_person, found_type, type_count_person = nlp_find_person(df,count_all,found_type)
    #     percentage_business_name, found_type, type_count_business = nlp_find_business_name(df,count_all,found_type)
    #     percentage_vehicle_type, found_type, type_count_vehicle_type = nlp_find_vehicle_type(df,count_all,found_type)
    #     percentage_color, found_type, type_count_color = nlp_find_color(df,count_all,found_type)
    #     percentage_car_make, found_type, type_count_car_make = nlp_find_car_make(df,count_all,found_type)
    #     percentage_car_model, found_type, type_count_car_model = nlp_find_car_model(df,count_all,found_type)
    #     percentage_neighborhood, found_type, type_count_neighborhood = nlp_find_neighborhood(df,count_all,found_type)
    #     percentage_borough, found_type, type_count_borough = nlp_find_borough(df,count_all,found_type)
    #     percentage_city, found_type, type_count_city = nlp_find_city(df,count_all,found_type)
    
    #     #TED PART: LIST or SIMILARITY CHECK TYPEs
    #   percentage_school_subject, found_type, type_count_school_subject= list_find_school_subject(df,count_all,found_type)
    #     percentage_business_name, found_type, type_count_business= list_find_business_name(df,count_all,found_type)
    #   percentage_neighborhood, found_type, type_count_neighborhood= list_find_neighborhood(df,count_all,found_type)
    #   percentage_area_of_study, found_type, type_count_area_of_study = list_find_area_of_study(df,count_all,found_type)
    #   percentage_agency, found_type, type_count_agency= list_find_agency(df,count_all,found_type)
    #   percentage_location, found_type, type_count_location= list_find_location_type(df,count_all,found_type)
    #   percentage_parks_playgrounds, type_count_location_parks_playgrounds = list_find_parks_playgrounds(df,count_all,found_type
    # !!! NOTE: Please remeber to add type_count_XXX back to type_count in LINE 347
    fileinfo.extend([filename,mean,std,count_all,col_length, percentage_website, percentage_zip,percentage_buildingCode,percentage_phoneNum,percentage_lat_lon,percentage_add_st,percentage_school_name,percentage_house_no,percentage_school_lvl,percentage_person,percentage_school_subject,percentage_vehicle_type, percentage_color,percentage_car_make,percentage_car_model,percentage_neighborhood,percentage_borough,percentage_city,percentage_business_name,percentage_area_of_study,percentage_location,percentage_parks_playgrounds,found_type, type_count])
    regex_res.append(fileinfo)
    print(regex_res)
    # USE ME to export the JSON for current dataset
    print("Saving Dataset =============== : ", str(processCount) + ' - ' +filename)
    processCount += 1
    #outJSON = deepcopy(jsonSchema)
    #outJSON["column_name"] = filename
    #outJSON["semantic_type"] = found_type
    #outJSON["count"] = type_count
    #outJSON = sc.parallelize([json.dumps(outJSON)])
    #outJSON.saveAsTextFile(outputDirectory + filename + '/task2.json')



# Output regex function result 
rdd = sc.parallelize(regex_res)
row_rdd = rdd.map(lambda x: Row(x))
df = row_rdd.toDF()
df = df.select(col('_1').alias('coln'))
length = len(df.select('coln').take(1)[0][0])
df = df.select([df.coln[i] for i in range(length)])
df = df.select(col('coln[0]').alias('filename'),col('coln[1]').alias('mean'),col('coln[2]').alias('stdv'),
           col('coln[3]').alias('count_all'),col('coln[4]').alias('col_length'),col('coln[5]').alias('precentage_website'),
           col('coln[6]').alias('precentage_zip'),col('coln[7]').alias('percentage_buildingCode'),col('coln[8]').alias('percentage_phoneNum'),
           col('coln[9]').alias('percentage_lat_lon'),col('coln[10]').alias('percentage_add_st'),col('coln[11]').alias('percentage_school_name'),
           col('coln[12]').alias('percentage_houseNo'),col('coln[13]').alias('percentage_school_lvl'),col('coln[14]').alias('percentage_person'),
           col('coln[15]').alias('percentage_school_subject'),col('coln[16]').alias('percentage_vehicle_type'),col('coln[17]').alias('percentage_color'),
           col('coln[18]').alias('percentage_car_make'),col('coln[19]').alias('percentage_car_model'),
           col('coln[20]').alias('percentage_neighborhood'),col('coln[21]').alias('percentage_borough'),col('coln[22]').alias('percentage_city'),
           col('coln[23]').alias('percentage_business_name'),col('coln[24]').alias('percentage_area_of_study'),col('coln[25]').alias('percentage_location_type'),
           col('coln[26]').alias('percentage_parks_playgrounds'),col('coln[27]').alias('types'), col('coln[28]').alias('types_count')
           )

types_found_count = df.where(col('types') > " ").count()
print(types_found_count)
#df.write.csv('regex_res.csv')




0.2676056338028169
0.04225352112676056
0.028169014084507043
[['vw9i-7mzq.interest3.txt.gz', 0.29444444444444445, 0.3207935422230724, 71, 16.88888888888889, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.028169014084507043, 0.2676056338028169, 0.04225352112676056, 0, [], 24]]
[['tyfh-9h2y.BROOKLYN___COOPERATIVES_COMPARABLE_PROPERTIES___Building_Classification.txt.gz', 0.3785, 0.47786016085601163, 958, 10.5, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['building_classification'], 958]]
[['w7w3-xahh.Location.txt.gz', 8.022479922576566e-05, 0.00413211825588574, 119500, 39.050562847004095, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['lat_lon_cord'], 119500]]
[['nfkx-wd79.Address_1.txt.gz', 0.014797507788161994, 0.08923417616800781, 1983, 18.692107995846314, 0, 0, 0, 0, 0, 0.880988401412002, 0.011598587997982855, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['address'], 1770]]
0.117096018735363
0.0117096018735363
0.03044496487119438
0

0.34394654088050314
0.2456761006289308
0.17020440251572327
[['p2d7-vcsb.ACCOUNT_CITY.txt.gz', 0.01819801980198018, 0.07556178382538213, 5088, 9.58085808580858, 0, 0, 0, 0, 0, 0.005699685534591195, 0.00039308176100628933, 0, 0, 0, 0, 0, 0, 0, 0, 0.34394654088050314, 0, 0, 0.17020440251572327, 0, 0, 0.2456761006289308, ['neighborhood', 'parks and playgrounds', 'business'], 3897]]
0.0028873917228103944
[['qcdj-rwhu.BUSINESS_NAME2.txt.gz', 0.008068647540983607, 0.052338488074875206, 1039, 13.487704918032787, 0, 0, 0, 0, 0, 0.03176130895091434, 0.007699711260827719, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0028873917228103944, 0, 0, 0, [], 44]]
3.660556587629149e-05
3.050463823024291e-06
6.100927646048582e-06
[['kj4p-ruqc.StreetName.txt.gz', 0.025680065524193614, 0.060833814228135275, 327819, 13.018397177419354, 0, 3.050463823024291e-06, 0, 0, 0, 0.9247999658348052, 0.005853840076383614, 1.5252319115121454e-05, 0, 0, 0, 0, 0, 0, 0, 3.660556587629149e-05, 0, 0, 6.100927646048582e-06, 0, 0, 3.05046

0.05598755832037325
[['7btz-mnc8.Provider_First_Name.txt.gz', 0.022656000000000058, 0.07418824691812463, 1929, 6.384, 0, 0, 0, 0, 0, 0, 0.002592016588906169, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05598755832037325, 0, 0, 0.02021772939346812, [], 152]]
[['wg9x-4ke6.Principal_phone_number.txt.gz', 0.001086760280842527, 0.02255314279897806, 2190, 11.995987963891675, 0, 0, 0, 0.9584474885844749, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['phone_number'], 2099]]
0.25
[['cyfw-hfqk.STATEN_ISLAND_CONDOMINIUM_PROPERTY_Neighborhood.txt.gz', 0.15155555555555555, 0.32460826819071903, 24, 12.11111111111111, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25, 0, 0, 0, 0, 0, 0, ['neighborhood'], 6]]
7.506404571614853e-06
1.0723435102306932e-06
1.0723435102306932e-05
[['sqcr-6mww.Cross_Street_1.txt.gz', 0.006666392769104305, 0.029173850509321152, 932537, 12.995891536565324, 0, 0, 0, 0, 0, 0.8668224424339195, 0.008757829448054071, 5.361717551153466e-06, 0, 0, 0, 0, 0, 0, 0, 7.506404571614853e-0

[['dvzp-h4k9.COMPARABLE_RENTAL_____1_____Building_Classification.txt.gz', 0.3453333333333333, 0.35017277121011237, 1381, 10.666666666666666, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['building_classification'], 1381]]
0.18827611395178961
0.06525444363282201
[['i8ys-e4pm.CORE_COURSE_9_12_ONLY_.txt.gz', 0.08264999999999999, 0.21670821396523024, 16428, 10.45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.06525444363282201, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 1072]]
1.1065493001462969e-07
7.745845101024079e-07
0.0001751667542131588
3.9614464945237425e-05
4.4704591725910394e-05
0.00013179002164742396
[['jt7v-77mi.Vehicle_Make.txt.gz', 0.0010480139676996949, 0.02121762121099167, 9037103, 4.34730103302779, 0, 8.852394401170375e-07, 1.1065493001462969e-07, 0, 0, 6.639295800877781e-07, 2.766373250365742e-06, 5.090126780672966e-06, 0, 0, 7.745845101024079e-07, 0, 0, 0, 0, 3.9614464945237425e-05, 0, 0, 0.00013179002164742396, 1.1065493001462969e-07, 0.0001751667542131588, 4.47045917

0.11264367816091954
0.009195402298850575
0.027586206896551724
0.029885057471264367
[['vw9i-7mzq.interest1.txt.gz', 0.10420000000000003, 0.21858410401105516, 435, 16.35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.009195402298850575, 0, 0, 0, 0, 0, 0, 0, 0.029885057471264367, 0.11264367816091954, 0.027586206896551724, 0, [], 78]]
0.4090909090909091
[['bawj-6bgn.BRONX_CONDOMINIUM_PROPERTY_Neighborhood.txt.gz', 0.1921764705882353, 0.28757417201787216, 66, 15.647058823529411, 0, 0, 0, 0, 0, 0, 0.030303030303030304, 0, 0, 0, 0, 0, 0, 0, 0, 0.4090909090909091, 0, 0, 0, 0, 0, 0, ['neighborhood'], 29]]
2.1572824592053573e-06
0.0001787092789205718
1.8984085641007144e-06
0.09090727879182518
[['a5td-mswe.Vehicle_Color.txt.gz', 0.002353610503282274, 0.03732129798318185, 11588654, 3.833698030634573, 0, 0, 0, 0, 0, 3.4516519347285714e-07, 0, 1.7258259673642857e-07, 0, 0, 0, 0, 0, 0, 0, 0.0001787092789205718, 0, 0, 0.09090727879182518, 0, 2.1572824592053573e-06, 1.8984085641007144e-06, [], 1055617]]
1.0297096990

[['hy4q-igkk.School_Name.txt.gz', 0.0007042253521126761, 0.026537244621713762, 1789359, 33.2112676056338, 0, 0, 0, 0, 0, 0.00021124883268254163, 0.0040031094933995915, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 7541]]
0.0015308075009567547
0.00019135093761959434
0.00019135093761959434
0.001148105625717566
[['9b9u-8989.DBA.txt.gz', 0.0035926834253014643, 0.029844161942627494, 5226, 15.670345391375434, 0.00019135093761959434, 0, 0, 0, 0, 0.04286261002678913, 0.020283199387677, 0.00019135093761959434, 0, 0, 0, 0, 0, 0, 0, 0.00019135093761959434, 0, 0, 0.001148105625717566, 0, 0.0015308075009567547, 0.00019135093761959434, [], 348]]
[['ci93-uc8s.Website.txt.gz', 0.0005503558988338636, 0.014130831571006733, 6654, 27.840678479479024, 0.9954914337240758, 0, 0.00030057108506161706, 0, 0, 0.0004508566275924256, 0.0021039975954313195, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['website'], 6643]]
[['n2s5-fumm.BRONX_CONDOMINIUM_PROPERTY_Building_Classification.txt.gz', 0.52, 0.50119856344

0.17593961148648649
0.07823057432432433
[['6wcu-cfa3.CORE_COURSE__MS_CORE_and_9_12_ONLY_.txt.gz', 0.0702272727272727, 0.20793747499306978, 18944, 11.181818181818182, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.07823057432432433, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 1482]]
[['8wbx-tsch.Website.txt.gz', 0.020525896414342572, 0.09961105524334754, 72690, 19.9203187250996, 0.99889943596093, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['website'], 72610]]
0.0001262562496843594
0.0001262562496843594
0.0001262562496843594
[['rbx6-tga4.Owner_Street_Address.txt.gz', 0.003337313852205762, 0.026650863539309087, 39602, 16.424416971059287, 0, 0, 0, 0.0001262562496843594, 0, 0.8627342053431645, 0.001818089995454775, 0.002676632493308419, 0, 0, 0, 0, 0, 0, 0, 0.0001262562496843594, 0, 0, 0.0001262562496843594, 0, 0, 0.0001262562496843594, ['address'], 34364]]
0.08841201716738198
[['735p-zed8.CANDMI.txt.gz', 0.2984166666666666, 0.3098738756987747, 116500, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

[['rb2h-bgai.Website.txt.gz', 0.5, 0.0, 32, 23.375, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['website'], 32]]
0.0021358630828548445
5.547696319103492e-05
0.00011095392638206984
0.001692047377326565
[['tg3t-nh4h.BusinessName.txt.gz', 0.04891516631064929, 0.05023942801263042, 36051, 15.061946902654867, 5.547696319103492e-05, 0, 0, 0, 0, 0.029125405675293335, 0.008820837147374553, 0, 0, 0, 0, 0, 0, 0, 0, 5.547696319103492e-05, 0, 0, 0.001692047377326565, 0, 0.0021358630828548445, 0.00011095392638206984, [], 1514]]
9.44498616687326e-08
0.01063070973026253
1.6056476483684542e-06
0.14340492506914673
0.0003998062644437451
[['kiv2-tbus.Vehicle_Body_Type.txt.gz', 0.0014154634373544465, 0.030604148238140054, 10587628, 3.406613879832324, 0, 0, 0, 0, 0, 0.005591620710512307, 0, 8.6893872735234e-06, 0, 0, 9.44498616687326e-08, 0, 0, 0, 0, 1.6056476483684542e-06, 0, 0, 0.0003998062644437451, 0, 0.01063070973026253, 0.14340492506914673, ['parks and playgrounds'], 1694417]]

[['aiww-p3af.Park_Facility_Name.txt.gz', 0.00042140750105351877, 0.020528212319963927, 1882407, 29.788874841972188, 0, 0, 0, 0, 0, 0.0003617708603931031, 0.003726611726369483, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 7696]]
5.785854393843327e-06
[['tqtj-sjs8.FromStreetName.txt.gz', 0.003671730515191551, 0.02141901216982397, 9160272, 15.769767880732214, 0, 0, 0, 0, 0, 0.8786362457359345, 0.005616645444589418, 0, 0, 0, 0, 0, 0, 0, 0, 5.785854393843327e-06, 0, 0, 0, 0, 0, 0, ['address'], 8100050]]
[['3rfa-3xsf.School_Name.txt.gz', 0.0003878975950349108, 0.019695116019838796, 1783133, 30.06788207913111, 0, 0, 0, 0, 0, 0.00031910126726385527, 0.002836580333603831, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 5627]]
[['sxmw-f24h.Park_Facility_Name.txt.gz', 0.0008326394671107411, 0.028855492841238062, 1793774, 33.740216486261446, 0, 0, 0, 0, 0, 0.0001616703107526366, 0.0030572413247153766, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [], 5774]]
0.39388460796386665
0.32504993295082

0.19815668202764977
[['jcih-dj9q.QUEENS_____CONDOMINIUMS_COMPARABLE_PROPERTIES_____Neighborhood.txt.gz', 0.08911764705882352, 0.18238234309571688, 434, 10.823529411764707, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.19815668202764977, 0, 0, 0, 0, 0, 0, ['neighborhood'], 86]]
[['tukx-dsca.Address_1.txt.gz', 0.01341324200913242, 0.08427060927138551, 1799, 18.771689497716896, 0, 0, 0, 0, 0, 0.8804891606448026, 0.010561423012784881, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['address'], 1603]]
0.00010433784594517047
0.0001565067689177557
0.0001565067689177557
0.0001565067689177557
[['vr8p-8shw.DVT_MAKE.txt.gz', 0.004338386901152223, 0.0410448870740389, 76674, 5.303820497271073, 0, 0, 0, 0, 0, 0, 0, 7.825338445887784e-05, 0, 0, 0, 0, 0, 0, 0, 0.0001565067689177557, 0, 0, 0.0001565067689177557, 0, 0.00010433784594517047, 0.0001565067689177557, [], 50]]
[['pdpg-nn8i.BORO.txt.gz', 0.6022000000000001, 0.37453197460297033, 30211, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[['p6h4-mpyy.PRINCIPAL_PHONE_NUMBER.txt.gz', 0.0011526845637583893, 0.024829596275634865, 1823, 11.995525727069351, 0, 0, 0, 0.9901261656609983, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['phone_number'], 1805]]
0.00010679284652224988
[['dm9a-ab7w.STREET_NAME.txt.gz', 0.008268856855514628, 0.025638635038538155, 121731, 13.512236004022796, 0, 0, 0, 0, 0, 0.8836533011311827, 0.006333637282204204, 0, 0, 0, 0, 0, 0, 0, 0, 0.00010679284652224988, 0, 0, 0, 0, 0, 0, ['street'], 108352]]
[['weg5-33pj.SCHOOL_LEVEL_.txt.gz', 0.4082, 0.37975413098477284, 1262, 13.2, 0, 0, 0, 0, 0, 0, 1.0, 0, 0.9033280507131537, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['school_name', 'school level'], 2402]]
4.744968623360635e-05
6.839594412051365e-06
4.274746507532104e-06
8.549493015064207e-06
[['pvqr-7yc4.Vehicle_Make.txt.gz', 0.005551745635910225, 0.054476575317614774, 2339320, 4.147755610972569, 0, 0, 0, 0, 0, 0, 1.282423952259631e-06, 4.274746507532103e-07, 0, 0, 0, 0, 0, 0, 0, 6.839594412051365e-06

0.15688770026390694
[['ic3t-wcy2.Applicant_s_First_Name.txt.gz', 0.0020219328784292076, 0.021675339746336023, 1697947, 7.138351204567609, 0, 0, 1.1778930673336682e-06, 5.889465336668341e-07, 0, 0.000124267718603702, 4.1815203890345225e-05, 0, 0, 0, 0, 0, 0, 0, 0, 0.0005453644901754884, 0, 0, 0.15688770026390694, 1.001209107233618e-05, 1.7668396010005024e-06, 0.07834814632023261, ['business'], 400649]]
0.5865310468916388
0.2833694414864319
0.44905078397532405
0.14981823522931736
[['pgtq-ht5f.CORE_SUBJECT___MS_CORE_and__9_12_ONLY_.txt.gz', 0.2374, 0.4286581388472637, 27233, 6.6, 0, 0, 0, 0, 0, 0.13748026291631477, 0, 0, 0, 0, 0.44905078397532405, 0, 0, 0, 0, 0, 0, 0, 0, 0.2833694414864319, 0, 0.14981823522931736, ['school subject', 'school subject', 'parks and playgrounds'], 27770]]
[['yahh-6yjc.School_Type.txt.gz', 0.5305, 0.41519834617525475, 1722, 7.5, 0, 0, 0, 0, 0, 0, 0.281068524970964, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['school level'], 2206]]
0.007042253521126761
0.08

0.0006321112515802782
[['imfa-v5pv.School_Name.txt.gz', 0.006361323155216285, 0.07952911532016299, 1582, 26.578244274809162, 0, 0, 0, 0, 0, 0.04804045512010114, 0.938685208596713, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['school_name'], 1561]]
[['4nft-bihw.Property_Address.txt.gz', 0.0327117247063672, 0.06787248811210186, 12246, 17.343704924788792, 0, 0, 0, 0, 0, 0.8637922586967173, 0.004817899722358321, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['address'], 10637]]
0.008717484707792074
1.664076691864583e-06
0.14689882229789197
0.0002246503534017187
[['faiq-9dfq.Vehicle_Body_Type.txt.gz', 0.0016011527377521601, 0.032058085957467244, 11417743, 3.345821325648415, 0, 0, 0, 0, 0, 0.004486000429331787, 0, 6.130808864764253e-07, 0, 0, 0, 0, 0, 0, 0, 1.664076691864583e-06, 0, 0, 0.0002246503534017187, 0, 0.008717484707792074, 0.14689882229789197, ['parks and playgrounds'], 1830598]]
0.2579369418625843
0.34251868051758705
0.17625296154547113
[['easq-ubfe.CITY.txt.gz', 0.003042580645161

[['xne4-4v8f.SCHOOL.txt.gz', 0.043712574850299404, 0.20451609143839217, 1743, 28.965868263473055, 0, 0, 0, 0, 0, 0.04991394148020654, 0.5869191049913941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ['school_name'], 1110]]
0.013663691018400438
[['2v9c-2k7f.DBA.txt.gz', 0.3522745098039216, 0.14673047414679632, 5489, 17.99346405228758, 0, 0, 0, 0, 0, 0.022408453270176716, 0.020404445254144653, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.013663691018400438, 0, 0, 0, [], 310]]
0.011196228638774307
0.14319387153800825
[['him9-7gri.Agency.txt.gz', 0.2597777777777777, 0.22815942282146098, 1697, 3.5555555555555554, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.011196228638774307, 0, 0, ['city agency'], 262]]
0.5190839694656488
0.2595419847328244
0.3893129770992366
0.1297709923664122
[['8i43-kna8.CORE_SUBJECT.txt.gz', 0.2, 0.447213595499958, 524, 6.6, 0, 0, 0, 0, 0, 0.1297709923664122, 0, 0, 0, 0, 0.3893129770992366, 0, 0, 0, 0, 0, 0, 0, 0, 0.2595419847328244, 0, 0.1297709923664122, ['schoo

In [2]:
business_keywords

NameError: name 'business_keywords' is not defined