In [30]:
'''
Ranks the amount of changes for all 130 queries based on the get difference json file (The python script
that creates the json file will take care of the two dates part)
**Run the getPairs method for each new SERP Collection folder entry.
'''

'\nRanks the amount of changes for all 130 queries based on the get difference json file (The python script\nthat creates the json file will take care of the two dates part)\n'

In [31]:
import json
import os

In [32]:
# Load the json file containing the changes for a specific query term between the specified previous and current dates.
# changes is a dictionary whose keys are 'organic' and the other components, and whose values are dictionaries.
#   For non-organic components, the key is the change type and the value is either appear or disappear. For organic components, the key is the domain name and the value is another dictionary whose keys are title, pos1, pos2, change (amount), change_type.
def getChanges(filename):
    """Load one query's change file and return its change score.

    filename is the name of a json file inside serp-scraper-get-difference/changes/,
    e.g. 'Ace_changebetween_6-8-22_and_6-9-22.json'.

    Only the 'organic' section of the file is used; it is handed to
    getChangeScore, whose result is returned unchanged.
    """
    changesPath = f'serp-scraper-get-difference/changes/{filename}'
    with open(changesPath) as changesFile:
        loaded = json.load(changesFile)
    return getChangeScore(loaded['organic'])

In [33]:
# Give each query a change score for the amount of change it has undergone between the two given consecutive dates
# an appear is given 3 points; a movement of fewer than 5 positions is given 1 point; a movement of 5 or more positions is given 2 points
# the numbers of appears and movements are also recorded
def getChangeScore(organic):
    """Score how much a query's organic results changed.

    organic maps each result to a dictionary that has a 'change_type' entry
    and, for entries of type 'move', a numeric 'change' entry.

    Scoring:
      * 'appear'                       -> 3 points
      * 'move' by fewer than 5 places  -> 1 point
      * 'move' by 5 or more places     -> 2 points
    Every other change type contributes nothing.

    Returns a tuple (changeScore, appear, move) where appear and move are the
    number of 'appear' and 'move' entries that were seen.
    """
    changeScore = appear = move = 0
    for entry in organic.values():
        kind = entry['change_type']
        if kind == 'appear':
            appear += 1
            changeScore += 3
        elif kind == 'move':
            move += 1
            # The direction of the move does not matter, only its size.
            changeScore += 1 if abs(entry['change']) < 5 else 2
    return changeScore, appear, move

In [34]:
#getChanges('Ace_changebetween_6-8-22_and_6-9-22.json')

(8, 2, 2)

In [35]:
'''
Generate the json file for the rank of the changes for all queries between the two given dates (rankforquerychangebetween_date1_and_date2.json) and sort the queries based on their change score (Higher score is ranked higher)
# scores is a dictionary with the key being the query and the value being a dictionary whose keys are score (change score), appear (count), and move (count).
# categories_date1 is all the category folders (Identities, Relationship) for the earlier date; categories_date2 is all the category folders (Identities, Relationship) for the later date;
# category is the individual category folder (e.g. Identities)
# files is all the query.html files inside each category folder; file is the individual query.html file
# query is the query term without the .html extension
'''

def generateRankForQueryChangeBetweenTwoDates(date1, date2):
    """Rank all queries by their change score between two dates and save the ranking.

    date1 is the earlier date and date2 the later one; both are folder names
    inside SERP_Collection (e.g. "6-8-22").

    Every query file found in a category folder of date2 (for categories that
    also exist for date1) is scored with getChanges. Queries whose change file
    cannot be scored get -1 for score, appear and move, so they sort last.

    The result maps each query to a dictionary with the keys score, appear,
    move and rank (1 = highest score). It is printed and written to
    ranked-queries/rankforquerychangebetween_{date1}_and_{date2}.json.
    """
    scores = {}
    categories_date1 = set(os.listdir(f'SERP_Collection/{date1}'))  # earlier date
    categories_date2 = os.listdir(f'SERP_Collection/{date2}')  # later date
    for category in categories_date2:
        # Ignore hidden entries such as .DS_Store and categories missing for the earlier date.
        if category.startswith('.') or category not in categories_date1:
            continue
        for file in sorted(os.listdir(f"SERP_Collection/{date2}/{category}")):
            if file.startswith('.'):  # ignore .DS_Store
                continue
            # Strip only the final extension, so a query that itself contains a dot
            # is not cut short and a name without any dot does not raise.
            query = os.path.splitext(file)[0]
            try:
                score, appear, move = getChanges(f"{query}_changebetween_{date1}_and_{date2}.json")
            except FileNotFoundError:
                # The query does not have a change file for this pair of dates.
                print("file not found")
                score = appear = move = -1
            except Exception as error:
                # Still best-effort, but say what actually went wrong instead of
                # reporting every failure as a missing file.
                print(f"could not score {query!r}: {error!r}")
                score = appear = move = -1
            scores[query] = {'score': score, 'appear': appear, 'move': move}

    # Sort the queries based on their change score (higher score is ranked higher).
    # sorted() is stable, so queries with equal scores keep their discovery order.
    scores = dict(sorted(scores.items(), key=lambda k_v: k_v[1]['score'], reverse=True))
    for rank, entry in enumerate(scores.values(), start=1):
        entry['rank'] = rank
    print(scores)

    # Make sure the output folder exists before writing into it.
    os.makedirs('ranked-queries', exist_ok=True)
    with open(f'ranked-queries/rankforquerychangebetween_{date1}_and_{date2}.json', "w") as outfile:
        json.dump(scores, outfile)

In [36]:
#generateRankForQueryChangeBetweenTwoDates("6-8-22", "6-9-22")

In [37]:
# Sort the collection-date folders from the most recent date to the oldest.
# Folder names are expected to look like month-day-year (e.g. 6-8-22).
# Hidden entries such as .DS_Store are filtered out rather than removed by name,
# so this cell also runs when no .DS_Store file is present.
folders = [name for name in os.listdir('SERP_Collection') if not name.startswith('.')]
datesList = []
for folder in folders:
    dateComponents = folder.split("-")
    datesList.append({
        'folder': folder,
        'month': int(dateComponents[0]),
        'day': int(dateComponents[1]),
        'year': int(dateComponents[2]),
    })
datesList = sorted(datesList, key=lambda x: (x['year'], x['month'], x['day']), reverse=True)
# Reuse the original folder names instead of rebuilding them from the integers,
# so names with leading zeros still match the directories on disk.
sortedDates = [d['folder'] for d in datesList]
folders = sortedDates
folders

['6-13-22', '6-9-22', '6-8-22']

In [38]:
# Goes through every pair of consecutive dates and generates, for each pair, the json file that ranks all queries by how much they changed between the two dates.
def getPairs(folders):
    """Generate the ranking file for every pair of neighbouring dates.

    folders is a list of date folder names; each entry is paired with the one
    that follows it. The following entry is passed as the first (earlier) date
    to generateRankForQueryChangeBetweenTwoDates, so the list is expected to be
    ordered from the most recent date to the oldest.

    Prints the current working directory once, then one line per pair.
    """
    print(os.getcwd())
    for laterDate, earlierDate in zip(folders, folders[1:]):
        print(earlierDate + " " + laterDate)
        generateRankForQueryChangeBetweenTwoDates(earlierDate, laterDate)
        print()

In [39]:
# Generate and write the ranking file for each pair of neighbouring dates in folders.
getPairs(folders)

/Users/amyhu/Desktop/identities-and-relationships
6-9-22 6-13-22
file not found
file not found
{'Gender-reaffirming resources free': {'score': 24, 'appear': 7, 'move': 3, 'rank': 1}, 'they them': {'score': 17, 'appear': 4, 'move': 5, 'rank': 2}, 'What is trauma bonding': {'score': 12, 'appear': 3, 'move': 3, 'rank': 3}, 'Asexual': {'score': 11, 'appear': 2, 'move': 5, 'rank': 4}, 'Bisexual': {'score': 11, 'appear': 2, 'move': 5, 'rank': 5}, 'How to create space for LGBTQ+ community': {'score': 11, 'appear': 1, 'move': 8, 'rank': 6}, 'Toxic relationship': {'score': 10, 'appear': 1, 'move': 7, 'rank': 7}, 'Non-binary Gender Queer': {'score': 10, 'appear': 2, 'move': 4, 'rank': 8}, 'Omnisexual': {'score': 10, 'appear': 3, 'move': 1, 'rank': 9}, 'Hotlines for those in abusive relationships': {'score': 9, 'appear': 2, 'move': 3, 'rank': 10}, 'Cisgender': {'score': 9, 'appear': 0, 'move': 9, 'rank': 11}, 'Demisexual': {'score': 9, 'appear': 2, 'move': 3, 'rank': 12}, 'Gender neutral gender i