<a href="https://colab.research.google.com/github/zmuhls/ccny-data-science/blob/main/assets/notebooks/sample_pythonScript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Analysis Workflow

This notebook outlines a simple text analysis process that builds on its initial results with further analysis.  

1. **Import Libraries**: Load necessary tools for text processing and visualization
2. **Define Functions**: Create helper functions to handle text
3. **Set Variables**: Set file paths and key variables
4. **Read File**: Load the text for analysis
5. **Process Text**: Clean and count word frequencies
6. **Show Term Results**: Display the most frequent words
7. **Use NLTK**: Use the bigram function from the Natural Language Toolkit (NLTK)
8. **Show Bigram Results**: Display the most frequent co-occurring words

______

In [None]:
# Import the necessary libraries to begin the script
import re  # Regular expressions for text processing
from collections import Counter  # Count occurrences of elements like words

# Define the function to split up a text into individual words
def split_into_words(any_chunk_of_text: str) -> list[str]:
    """Return the lowercase words found in a chunk of text, in order.

    A "word" is a maximal run of regex word characters (letters, digits and
    underscore), so punctuation and whitespace act as separators and
    contractions are broken apart ("don't" -> "don", "t").

    Args:
        any_chunk_of_text: The raw text to tokenize.

    Returns:
        A list of lowercase word strings. The list is empty when the text
        contains no word characters; it never contains empty strings.
    """
    lowercase_text = any_chunk_of_text.lower()  # Convert the text to lowercase to ensure uniformity
    # Collect the runs of word characters rather than splitting on the gaps:
    # re.split(r"\W+", ...) yields "" tokens whenever the text starts or ends
    # with punctuation/whitespace, and those would be counted as words.
    return re.findall(r"\w+", lowercase_text)

In [None]:
# Define the file paths and assign variables
tos_file = "/content/nest_tos.txt"
# NOTE(review): pvp_file was read below but never defined, so this cell raised
# a NameError. The path here is an assumption modelled on tos_file (presumably
# the privacy policy text) -- confirm the real location before running.
pvp_file = "/content/nest_pvp.txt"

# How many of the top-ranked words to report
number_of_desired_words = 20

# Create a list of stopwords: common English function words plus a few extra
# tokens ('mr', 'tm', 'said', 'amp', 'gutenberg', 'project') to be excluded
# from the frequency counts
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'mr',
 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'tm', 'said',
 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 've', 'll', 'amp', 'gutenberg', 'project']

# Read in the files; each `with` block closes its file handle as soon as the
# text has been loaded instead of leaving it open for the rest of the session
with open(tos_file, encoding="utf-8") as tos_handle:
    tos_text = tos_handle.read()
with open(pvp_file, encoding="utf-8") as pvp_handle:
    pvp_text = pvp_handle.read()

In [None]:
# Tokenize and count frequency of meaningful terms in google nest's terms of service or privacy policy
all_of_words = split_into_words(tos_text)

# Build a set once so each stopword check is a constant-time lookup rather
# than a scan through the whole stopwords list for every token
stopword_lookup = set(stopwords)

# Keep only the tokens that are not stopwords, preserving their original order
meaningful_words = [word for word in all_of_words if word not in stopword_lookup]

# Tally the remaining words and keep the top `number_of_desired_words` entries
# as (word, count) pairs, most frequent first
meaningful_words_tally = Counter(meaningful_words)
most_frequent_meaningful_words = meaningful_words_tally.most_common(number_of_desired_words)

# Output results (a bare expression on the last line is displayed by the notebook)
most_frequent_meaningful_words

[('nest', 253),
 ('services', 226),
 ('products', 153),
 ('may', 102),
 ('terms', 94),
 ('use', 90),
 ('third', 76),
 ('party', 72),
 ('agree', 59),
 ('service', 54),
 ('arbitration', 41),
 ('product', 41),
 ('information', 41),
 ('content', 39),
 ('including', 38),
 ('access', 37),
 ('monitoring', 35),
 ('applicable', 33),
 ('account', 31),
 ('law', 30)]

In [None]:
from nltk import bigrams  # Import NLTK's bigrams function

# Generate bigrams (pairs of adjacent words) from the meaningful words.
# Because stopwords were removed beforehand, two words paired here may have
# been separated by one or more stopwords in the original text.
word_bigrams = list(bigrams(meaningful_words))

# Count the frequency of bigrams and keep the same number of top results as
# the single-word ranking, so both tables stay in sync when
# number_of_desired_words is changed
bigrams_tally = Counter(word_bigrams)
most_frequent_bigrams = bigrams_tally.most_common(number_of_desired_words)

# Output results (a bare expression on the last line is displayed by the notebook)
most_frequent_bigrams

[(('products', 'services'), 72),
 (('third', 'party'), 64),
 (('services', 'products'), 44),
 (('use', 'services'), 23),
 (('pro', 'monitoring'), 21),
 (('party', 'products'), 19),
 (('nest', 'com'), 18),
 (('nest', 'products'), 17),
 (('applicable', 'law'), 15),
 (('nest', 'may'), 14),
 (('services', 'including'), 13),
 (('https', 'nest'), 12),
 (('third', 'parties'), 12),
 (('use', 'products'), 12),
 (('mobile', 'apps'), 11),
 (('authorised', 'users'), 11),
 (('using', 'services'), 11),
 (('understand', 'agree'), 11),
 (('access', 'use'), 10),
 (('services', 'may'), 10)]