In [1]:
#!/usr/bin/env python
# coding: utf-8

# --- NOTES -------------------------------------------------------------------
# 1. Update the datasets, dataList
# -----------------------------------------------------------------------------

import os
import re
import sys
import json
import time
import pyspark
from copy import deepcopy
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.functions import udf, unix_timestamp, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType, TimestampType
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline 
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.types import StringType
from pyspark.sql.functions import array
import pyspark.sql.functions as f
import csv
from pyspark.sql.functions import *
from pyspark.ml.feature import * 
from pyspark.sql.types import *
from nltk.corpus import stopwords

In [3]:
# -----------------------------------------------------------------------------
# --- Function Definitions Begin ----------------------------------------------

def write_keyword_list_to_txt(lst, dest_filename):
    with open(dest_filename, 'w') as f:
        wr = csv.writer(f, quoting=csv.QUOTE_ALL)
        wr.writerow(lst)

def write_regex_to_file(rx, dest_filename):
    with open(dest_filename, 'w') as f:
        f.write(rx)
        
def get_regex_from_list(lst):
    regex = ""
    for word in lst:
        regex += "\s"
        regex += word
        regex += "|"
    return(regex)

# --- Function Definitions End ------------------------------------------------
# -----------------------------------------------------------------------------

In [4]:
# -----------------------------------------------------------------------------
# --- MAIN --------------------------------------------------------------------

if __name__ == "__main__":
    # Setting spark context and 
    sc = SparkContext()
    spark = SparkSession \
        .builder \
        .appName("project_task1") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)


In [5]:

    # Current user path
    env_var = os.environ
    this_user = env_var['USER']

    # Input & output directories
    #inputDirectory = "/user/hm74/NYCOpenData/"#sys.argv[1]
    inputDirectory = "/home/ted/school/big_data/project/big_data_course_project/task2/raw_data/"
    inputFileClusters = "/home/ted/school/big_data/project/big_data_course_project/task2/resources/filename_clusters.json"
    #outputDirectory = "/user/" + this_user + "/project/task1/"#sys.argv[2]

    # Output JSON Semantic Schema
    semanticSchema = {
        "semantic_type": "",
        "count": 0
    }

    # Importing cluster3 format it and put it into a list
    raw_data = sc.textFile("cluster3.txt")
    raw_list = raw_data.flatMap(lambda x: x.split(",")).collect()
    raw_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in raw_list]
    raw_list = [re.sub(" " "", "", item)for item in raw_list]
    
    # Iteration over dataframes begins bu using dataframe file names
    processCount = 1

    #df_new = df_split_words.withColumn("word", array(df_split_words["word"]))

    # Create schema for raw data before reading into df 
    customSchema = StructType([
        #StructField("My_array", ArrayType(
        #StructType([StructField("val", StringType())]))   
        StructField("val", StringType(), True),
        StructField("count", IntegerType(), True)])    
 

In [None]:
# area of study filelist (aos)
aos_list = sc.textFile("areas_of_study")
aos_list = aos_list.flatMap(lambda x: x.split(",")).collect()
aos_list = [x.strip('"') for x in aos_list]
aos_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in aos_list]
aos_list = [re.sub(" " "", "", item)for item in aos_list]

# generate keyword list for areas of study
aos_keyword_list = []
aos_current_list = []
for filename in aos_list: # first. change to map later
    aos_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    aos_df_clean = aos_df.select("val", f.regexp_replace(f.col("val"), "[\$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    aos_df_split_words = aos_df_clean.withColumn('word', f.explode(f.split(f.col('clean_word'), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")
        
    aos_current_list = [row['word'] for row in aos_df_split_words.take(20)]
    aos_keyword_list = list(set(aos_keyword_list).union(set(aos_current_list)))
    
# remove stopwords
filtered_aos_keywords = [word for word in aos_keyword_list if word not in stopwords.words('english')]

# write to file
write_keyword_list_to_txt(filtered_aos_keywords, 'area_of_study_keywords')


In [None]:
filtered_aos_keywords

In [None]:
# city agency filelist (ca)
ca_list = sc.textFile("agency_filelist")
ca_list = ca_list.flatMap(lambda x: x.split(",")).collect()
ca_list = [x.strip('"') for x in ca_list]
ca_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in ca_list]
ca_list = [re.sub(" " "", "", item)for item in ca_list]

# generate keyword list for city agencies
ca_current_list = []
ca_keyword_list = []
for filename in ca_list: # first. change to map later
    print(filename)
    ca_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    ca_df_clean = ca_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    ca_df_split_words = ca_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    ca_current_list = [row['word'] for row in ca_df_split_words.take(2)]
    ca_keyword_list = list(set(ca_keyword_list).union(set(ca_current_list)))

# remove stopwords (sw list is lowercase)
filtered_ca_keywords = [word for word in ca_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match 
filtered_ca_keywords = [x.upper() for x in filtered_ca_keywords]

write_keyword_list_to_txt(filtered_ca_keywords, 'city_agency_keywords')
  

In [None]:
filtered_ca_keywords

In [None]:
# parks and playgrounds (pp) filelist
pp_list = sc.textFile("park_playground_filelist")
pp_list = pp_list.flatMap(lambda x: x.split(",")).collect()
pp_list = [x.strip('"') for x in pp_list]
pp_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in pp_list]
pp_list = [re.sub(" " "", "", item)for item in pp_list]

# generate keyword list for city agencies
pp_current_list = []
pp_keyword_list = []
for filename in pp_list: # first. change to map later
    pp_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    pp_df_clean = pp_df.select("val", f.regexp_replace(f.col("val"), "[\$#&,]", "").alias("clean_word"))
        
    # split words in each row and create new df with one word per row, and count
    pp_df_split_words = pp_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('val')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")
    #pp_df_new = pp_df_split_words.withColumn("word", array(pp_df_split_words["word"]))
    
        
    pp_current_list = [row['word'] for row in pp_df_split_words.take(40)]
    pp_keyword_list = list(set(pp_keyword_list).union(set(pp_current_list)))

# remove stopwords and other words which may result in a missclassification
# TODO: clean this up...
pp_keyword_list.remove('-')
pp_keyword_list.remove('school')
pp_keyword_list.remove('academy')
pp_keyword_list.remove('charter')
pp_keyword_list.remove('high')
pp_keyword_list.remove('jhs')
pp_keyword_list.remove('middle')
pp_keyword_list.remove('secondary')
pp_keyword_list.remove('senior')
pp_keyword_list.remove('h')
pp_keyword_list.remove('j')
pp_keyword_list.remove('e')
#pp_keyword_list.remove('c')
#pp_keyword_list.remove('f')
#pp_keyword_list.remove('w')
#pp_keyword_list.remove('b')
#pp_keyword_list.remove('r')
#pp_keyword_list.remove('l')
pp_keyword_list.remove('st')
#pp_keyword_list.remove('st.')
filtered_pp_keywords = [word for word in pp_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match original dataset
filtered_pp_keywords = [x.upper() for x in filtered_pp_keywords]

write_keyword_list_to_txt(filtered_pp_keywords, 'park_playground_keywords')

In [None]:
# NEIGHBORHOODS

# neighborhood filelist (nh)
nh_list = sc.textFile("neighborhood_filelist")
nh_list = nh_list.flatMap(lambda x: x.split(",")).collect()
nh_list = [x.strip('"') for x in nh_list]
nh_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in nh_list]
nh_list = [re.sub(" " "", "", item)for item in nh_list]

# generate keyword list for city agencies
nh_current_list = []
nh_keyword_list = []
for filename in nh_list: # first. change to map later
    nh_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    nh_df_clean = nh_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    nh_df_split_words = nh_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    nh_current_list = [row['word'] for row in nh_df_split_words.take(20)]
    nh_keyword_list = list(set(nh_keyword_list).union(set(nh_current_list)))

# remove stopwords (sw list is lowercase)
nh_keyword_list.remove('st')
nh_keyword_list.remove('avenue')
#nh_keyword_list.remove('')
filtered_nh_keywords = [word for word in nh_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match 
filtered_nh_keywords = [x.upper() for x in filtered_nh_keywords]

write_keyword_list_to_txt(filtered_nh_keywords, 'neighborhood_keywords')

In [None]:
# LOCATION TYPE

# location type filelist (lt)
lt_list = sc.textFile("location_type_filelist")
lt_list = lt_list.flatMap(lambda x: x.split(",")).collect()
lt_list = [x.strip('"') for x in lt_list]
lt_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in lt_list]
lt_list = [re.sub(" " "", "", item)for item in lt_list]

# generate keyword list for city agencies
lt_current_list = []
lt_keyword_list = []
for filename in lt_list: # first. change to map later
    print(filename)
    lt_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    lt_df_clean = lt_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    lt_df_split_words = lt_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    lt_current_list = [row['word'] for row in lt_df_split_words.take(100)]
    lt_keyword_list = list(set(lt_keyword_list).union(set(lt_current_list)))

# remove stopwords (sw list is lowercase)
#lt_keyword_list.remove('street')
lt_keyword_list.remove('nyc')
filtered_lt_keywords = [word for word in lt_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match 
filtered_lt_keywords = [x.upper() for x in filtered_lt_keywords]

write_keyword_list_to_txt(filtered_lt_keywords, 'location_type_keywords')

In [None]:
# SCHOOL NAME

# school name type filelist (sn)
sn_list = sc.textFile("school_name_type_filelist")
sn_list = sn_list.flatMap(lambda x: x.split(",")).collect()
sn_list = [x.strip('"') for x in sn_list]
sn_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in sn_list]
sn_list = [re.sub(" " "", "", item)for item in sn_list]

# generate keyword list for city agencies
sn_current_list = []
sn_keyword_list = []
for filename in sn_list: # first. change to map later
    print(filename)
    sn_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    sn_df_clean = sn_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    sn_df_split_words = sn_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    sn_current_list = [row['word'] for row in sn_df_split_words.take(20)]
    sn_keyword_list = list(set(sn_keyword_list).union(set(sn_current_list)))

# remove stopwords (sw list is lowercase)
#lt_keyword_list.remove('street')
#sn_keyword_list.remove('nyc')
filtered_sn_keywords = [word for word in sn_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match 
filtered_sn_keywords = [x.upper() for x in filtered_sn_keywords]

write_keyword_list_to_txt(filtered_sn_keywords, 'school_name_keywords')

In [None]:
# SCHOOL SUBJECT

# school subject type filelist (ss)
ss_list = sc.textFile("school_subject_type_filelist")
ss_list = ss_list.flatMap(lambda x: x.split(",")).collect()
ss_list = [x.strip('"') for x in ss_list]
ss_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in ss_list]
ss_list = [re.sub(" " "", "", item)for item in ss_list]

# generate keyword list for city agencies
ss_current_list = []
ss_keyword_list = []
for filename in ss_list: # first. change to map later
    print(filename)
    ss_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    ss_df_clean = ss_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    ss_df_split_words = ss_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    ss_current_list = [row['word'] for row in ss_df_split_words.take(50)]
    ss_keyword_list = list(set(ss_keyword_list).union(set(ss_current_list)))

# remove stopwords (sw list is lowercase)
#lt_keyword_list.remove('street')
ss_keyword_list.remove('11')
ss_keyword_list.remove('10')
ss_keyword_list.remove('b')
ss_keyword_list.remove('9')
ss_keyword_list.remove('12')
bz_keyword_list.remove('+')
filtered_ss_keywords = [word for word in ss_keyword_list if word not in stopwords.words('english')]

# back to uppercase to match 
filtered_ss_keywords = [x.upper() for x in filtered_ss_keywords]

write_keyword_list_to_txt(filtered_ss_keywords, 'school_subject_keywords')

In [None]:
filtered_ss_keywords

In [6]:
# BUSINESS NAME

# business name filelist (bz)
bz_list = sc.textFile("business_filelist")
bz_list = bz_list.flatMap(lambda x: x.split(",")).collect()
bz_list = [x.strip('"') for x in bz_list]
bz_list = [re.sub("\[|\]|\'|\'|" "", "", item)for item in bz_list]
bz_list = [re.sub(" " "", "", item)for item in bz_list]

# generate keyword list for city agencies
bz_current_list = []
bz_keyword_list = []
for filename in bz_list: # first. change to map later
    print(filename)
    bz_df = sqlContext.read.format("csv").option("header","false").option("inferSchema", "true").option("delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
    bz_df_clean = bz_df.select("val", f.regexp_replace(f.col("val"), "[\-$#&,]", "").alias("clean_word"))

    # split words in each row and create new df with one word per row, and count
    bz_df_split_words = bz_df_clean.withColumn('word', f.explode(f.split(f.lower(f.col('clean_word')), ' ')))\
        .groupBy('word')\
        .count()\
        .sort('count', ascending=False)\
        .filter("word != ''")

    #ca_df_new = ca_df_split_words.withColumn("word", array(ca_df_split_words["word"]))
       
    bz_current_list = [row['word'] for row in bz_df_split_words.take(50)]
    bz_keyword_list = list(set(bz_keyword_list).union(set(bz_current_list)))

# remove stopwords (sw list is lowercase)
bz_keyword_list.remove('nyc')
bz_keyword_list.remove('ny')
bz_keyword_list.remove('engineering')
bz_keyword_list.remove('york')
bz_keyword_list.remove('+')
#bz_keyword_list.remove('(')
#bz_keyword_list.remove(')')

filtered_bz_keywords = [word for word in bz_keyword_list if word not in stopwords.words('english')]
for word in filtered_bz_keywords:
    if len(word) < 3:
        filtered_bz_keywords.remove(word)

# back to uppercase to match 
filtered_bz_keywords = [x.upper() for x in filtered_bz_keywords]

write_keyword_list_to_txt(filtered_bz_keywords, 'business_keywords')

qcdj-rwhu.BUSINESS_NAME2.txt.gz
9b9u-8989.DBA.txt.gz
w9ak-ipjd.Owner_s_Business_Name.txt.gz
tg3t-nh4h.BusinessName.txt.gz
43nn-pn8j.DBA.txt.gz
ci93-uc8s.Vendor_DBA.txt.gz
2bmr-jdsv.DBA.txt.gz
2v9c-2k7f.DBA.txt.gz


In [7]:
filtered_bz_keywords

['DELI',
 'FOOD',
 'TACO',
 'MARKET',
 'HOTEL',
 'DELUX',
 'CORP.',
 'GROUP',
 'CONTRACTING',
 'TRANSIT',
 'TRANSPORTATION',
 'BUILDING',
 'ASSOCIATES',
 'GRILL',
 'DESIGN',
 'RISTORANTE',
 'LIMO',
 'DAVID',
 'GOURMET',
 'GOLDEN',
 'COMMUNICATIONS',
 'BLACK',
 'CLASS',
 'ENTERPRISES',
 'INC',
 'TIME',
 'KITCHEN',
 'BURGER',
 'MANAGEMENT',
 'HUDSON',
 'TRIBECA',
 'RED',
 'WOK',
 'COFFEE',
 'PARTNERS',
 'EXECUTIVE',
 'WINE',
 'SERVICE',
 'CLUB',
 'RESTAURANT',
 'MICHAEL',
 'PUB',
 'WORLDWIDE',
 'STUDIO',
 'CUISINE',
 'PAUL',
 'ELECTRIC',
 'HIGH',
 'P.E.',
 'SERVICES',
 'SUSHI',
 'GOOD',
 'A2B',
 'UNITED',
 'CHINESE',
 'CENTER',
 'BISTRO',
 'P.C',
 'CAR',
 'CITY',
 'ASIAN',
 'C/S',
 'ISLAND',
 'ACAPOLCO',
 'TRATTORIA',
 'GREEN',
 'TAVERNA',
 'PLACE',
 'J.',
 'LLP',
 'JOSEPH',
 'SUPPLY',
 'PARK',
 'ROOM',
 'CHINA',
 'DINER',
 'ARCHITECTS',
 'JEWEL',
 'MEXICAN',
 'CLEANING',
 'ROBERT',
 'JOHN',
 'PIZZERIA',
 'EASTERN',
 'CORP',
 'EXPRESS',
 'LINE',
 'BLUE',
 'SECURITY',
 'VIA',
 'BAKERY',
 

In [8]:
test = get_regex_from_list(filtered_bz_keywords)
test

"\\sDELI|\\sFOOD|\\sTACO|\\sMARKET|\\sHOTEL|\\sDELUX|\\sCORP.|\\sGROUP|\\sCONTRACTING|\\sTRANSIT|\\sTRANSPORTATION|\\sBUILDING|\\sASSOCIATES|\\sGRILL|\\sDESIGN|\\sRISTORANTE|\\sLIMO|\\sDAVID|\\sGOURMET|\\sGOLDEN|\\sCOMMUNICATIONS|\\sBLACK|\\sCLASS|\\sENTERPRISES|\\sINC|\\sTIME|\\sKITCHEN|\\sBURGER|\\sMANAGEMENT|\\sHUDSON|\\sTRIBECA|\\sRED|\\sWOK|\\sCOFFEE|\\sPARTNERS|\\sEXECUTIVE|\\sWINE|\\sSERVICE|\\sCLUB|\\sRESTAURANT|\\sMICHAEL|\\sPUB|\\sWORLDWIDE|\\sSTUDIO|\\sCUISINE|\\sPAUL|\\sELECTRIC|\\sHIGH|\\sP.E.|\\sSERVICES|\\sSUSHI|\\sGOOD|\\sA2B|\\sUNITED|\\sCHINESE|\\sCENTER|\\sBISTRO|\\sP.C|\\sCAR|\\sCITY|\\sASIAN|\\sC/S|\\sISLAND|\\sACAPOLCO|\\sTRATTORIA|\\sGREEN|\\sTAVERNA|\\sPLACE|\\sJ.|\\sLLP|\\sJOSEPH|\\sSUPPLY|\\sPARK|\\sROOM|\\sCHINA|\\sDINER|\\sARCHITECTS|\\sJEWEL|\\sMEXICAN|\\sCLEANING|\\sROBERT|\\sJOHN|\\sPIZZERIA|\\sEASTERN|\\sCORP|\\sEXPRESS|\\sLINE|\\sBLUE|\\sSECURITY|\\sVIA|\\sBAKERY|\\sENGINEER|\\sAIA|\\sENVIRONMENTAL|\\sOLLIE'S|\\sBROOKLYN|\\sARCHITECTURAL|\\sAVENUE|\\sNE

In [None]:
def read_regex_file(inputFile):
    with open(inputFile) as f:
        return(f.read())

In [None]:
test_regex = read_regex_file('business_regex')
test_regex

In [None]:

# other option: do this instead of above if you want to mine frequent itemsets later
# (probably useful for strings with at least a few words)
#pp_df_split_words = pp_df.withColumn('word', f.explode(f.split(f.col('val'), ' ')))\
#    .select('word')
#pp_df.show()
#pp_df_new = pp_df.select(col("val"), split(col("val"), " \s*").cast(ArrayType(StringType())).alias("word"))
pp_df_new = pp_df.withColumn("word", array(pp_df["val"]))


fpGrowth = FPGrowth(itemsCol="word", minSupport=0.00, minConfidence=0.0)
model = fpGrowth.fit(pp_df_new)
model.freqItemsets.show()
#pp_df_new.show()
#model.associationRules.show()
#model.transform(pp_df_new).show()

In [None]:

# other option: do this instead of above if you want to mine frequent itemsets later
# (probably useful for strings with at least a few words)
#df_split_words = df.withColumn('word', f.explode(f.split(f.col('val'), ' ')))\
#    .select('word')
#df_new = df_split_words.withColumn("word", array(df_split_words["word"]))
#fpGrowth = FPGrowth(itemsCol="word", minSupport=0.1, minConfidence=0.3)
#model = fpGrowth.fit(df_new)
#model.freqItemsets.show()
#df_new.show()
