## Privacy Policy Analysis

In [1]:
import textstat as ts
import pandas as pd

In [2]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

# English stop words from NLTK, held in a set for the membership checks
# performed when filtering words.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veronicanutting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/veronicanutting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Note: I experimented with web-scraping tools like Beautiful Soup and PDF-scraping tools like PyPDF2 to look for a general way to systematize my analysis. However, the privacy policies varied significantly in structure and organization, so I decided to start with a less generalizable approach first.

In [3]:
# Given text file, return text as long string
def readText(filename, encoding=None):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding; pass e.g. ``'utf-8'`` so
            the result does not depend on the machine's locale settings.

    Returns:
        The full contents of the file as a single string.
    """
    with open(filename, encoding=encoding) as file:
        return file.read()

In [4]:
# Count number of lines in text not stripped of '\n'
def countLines(text):
    """Return the number of lines in ``text``.

    Lines are delimited by newline characters only. A final line that is
    not terminated by a newline is still counted, and the empty string has
    zero lines.

    Args:
        text: The string whose lines are counted.

    Returns:
        The line count as an int.
    """
    newline_count = text.count('\n')
    # Counting newlines alone misses a trailing line with no terminator.
    if text and not text.endswith('\n'):
        return newline_count + 1
    return newline_count

In [5]:
# Calculates various readability metrics
# Link to explanations: https://pypi.org/project/textstat/
def calculateComplexity(text):
    """Print several textstat readability metrics for ``text``.

    Printed, in order: the Flesch Reading Ease score, the Dale-Chall
    readability score, textstat's consensus grade level, and an estimated
    reading time. Nothing is returned.
    """
    ms_per_char = 14.69

    # Each entry pairs the printed label with a thunk, so every metric is
    # computed immediately before its own line is printed.
    report = (
        # Flesch Reading Ease: 90-100 is Very Easy, 0-29 is Very Confusing.
        # The maximum score is 121.22; there is no lower bound.
        ('Flesch:', lambda: ts.flesch_reading_ease(text)),
        ('Dale-Chall:', lambda: ts.dale_chall_readability_score(text)),
        # Estimated school grade level required to understand the text,
        # combining textstat's individual tests.
        ('Consensus:', lambda: ts.text_standard(text, float_output=True)),
        # NOTE(review): the division by 60 presumably converts seconds to
        # minutes -- confirm against the textstat documentation.
        ('Reading Time:', lambda: ts.reading_time(text, ms_per_char=ms_per_char) / 60),
    )

    for label, compute in report:
        print(label, compute())

In [6]:
# Given cleaned list of words, return counts
def findCommonWords(words_list, n=10):
    """Return the most frequent words in ``words_list`` with their counts.

    Args:
        words_list: Iterable of (already cleaned) words to tally.
        n: How many of the most common words to return (default 10).

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``n`` entries.
    """
    # Tally the argument itself rather than a notebook-level variable, so the
    # result does not depend on which cells happen to have been run.
    return Counter(words_list).most_common(n)

In [7]:
def findLongestWord(words_list):
    """Return every distinct word of maximal length in ``words_list``.

    Args:
        words_list: Iterable of words; duplicates are ignored.

    Returns:
        A list of the longest distinct words, sorted alphabetically so the
        result is the same on every run. The list is empty when
        ``words_list`` is empty.
    """
    unique_words = set(words_list)
    if not unique_words:
        # Nothing to compare; avoid indexing into an empty sequence.
        return []

    max_word_length = max(len(word) for word in unique_words)
    return sorted(word for word in unique_words if len(word) == max_word_length)

In [8]:
# Return list of cleaned words
def cleanWords(text):
    """Tokenize ``text`` and build a cleaned, lemmatized word list.

    Returns:
        A tuple ``(words, lemmatized_words)``: ``words`` is every lowercased
        token of ``text``; ``lemmatized_words`` is those tokens with the
        entries of ``stop_words`` removed and each remaining word passed
        through ``WordNetLemmatizer.lemmatize``.
    """
    # 1. Replace every character that is not an ASCII letter with a space,
    #    then lowercase and split on whitespace.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()

    # 2. Drop common stop words.
    kept_words = [token for token in words if token not in stop_words]

    # 3. Lemmatize what is left so plurals collapse to one form for analysis.
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(token) for token in kept_words]

    return words, lemmatized_words

In [9]:
# Display name of each site -> base name of its policy text file
# (the analysis loop reads "policies/<base name>.txt").
policies = {
    'BBC': 'bbc',
    'Harvard Book Store': 'bookstore',
    'Wired': 'conde',
    'Ebay': 'ebay',
    'YouTube': 'google',
    'Kayak': 'kayak',
    'StackOverflow': 'stack',
    'Sweetgreen': 'sweetgreen',
    'Target': 'target',
    'Zappos': 'zappos',
    'Zoom': 'zoom',
}

In [10]:
# Print a readability and vocabulary report for every policy,
# visiting the sites in alphabetical order of their display name.
for p in sorted(policies):
    print(p)

    policy_path = "policies/" + str(policies[p]) + ".txt"
    text = readText(policy_path)
    print('Linecount:', countLines(text))
    calculateComplexity(text)

    # words: every lowercased token; cleaned_words: stop words removed, then lemmatized.
    words, cleaned_words = cleanWords(text)
    for label, value in (
        ('Wordcount:', len(words)),
        ('Unique words:', len(set(words))),
        ('Cleaned Wordcount:', len(cleaned_words)),
        ('Cleaned Unique words:', len(set(cleaned_words))),
    ):
        print(label, value)

    print('Longest word:', findLongestWord(cleaned_words))
    print('Most common words:', findCommonWords(cleaned_words))
    print()

BBC
Linecount: 378
Flesch: 58.15
Dale-Chall: 6.85
Consensus: 13.0
Reading Time: 4.079666666666666
Wordcount: 3730
Unique words: 666
Cleaned Wordcount: 1726
Cleaned Unique words: 530
Longest word: ['personalisation']
Most common words: [('information', 83), ('bbc', 60), ('service', 56), ('use', 43), ('u', 40), ('might', 35), ('personal', 34), ('cooky', 24), ('account', 24), ('like', 21)]

Ebay
Linecount: 371
Flesch: 19.98
Dale-Chall: 7.59
Consensus: 21.0
Reading Time: 10.991333333333333
Wordcount: 8357
Unique words: 1239
Cleaned Wordcount: 4818
Cleaned Unique words: 1046
Longest word: ['networkadvertising']
Most common words: [('data', 235), ('personal', 113), ('service', 109), ('ebay', 99), ('user', 77), ('information', 66), ('use', 61), ('may', 48), ('e', 46), ('third', 46)]

Harvard Book Store
Linecount: 8
Flesch: 41.19
Dale-Chall: 9.48
Consensus: 13.0
Reading Time: 0.22133333333333333
Wordcount: 176
Unique words: 107
Cleaned Wordcount: 89
Cleaned Unique words: 77
Longest word: ['res