# Analyzing Language and Texts

[David J. Thomas](mailto:dave.a.base@gmail.com), [thePortus.com](http://thePortus.com)<br />
Instructor of Ancient History and Digital Humanities,<br />
Department of History,<br />
[University of South Florida](https://github.com/usf-portal)

---

## This workbook will...

* Use the `dhelp` module to access the `cltk` and `nltk` modules
* Preprocess the text for analysis
* POS (Part of Speech) tag each text
* Perform word counts
* Analyze several other features of the charter texts

---

## 1) Import Module Dependencies

The cell below loads the Python packages this workbook needs. You **must** run it before any other cell.

In [None]:
from collections import UserList
from IPython.display import clear_output
from nltk.text import Text, TextCollection
from dhelp import LatinText

# Database URL passed to `sql.create_engine` when the models are configured.
# NOTE(review): a bare 'sqlite://' carries no file path — presumably the real
# database location has to be filled in here; confirm before running.
DB_PATH = 'sqlite://'

## First Time Only Setup

The following cell **must** be run the first time you use this workbook on a new computer. It downloads the `nltk` stopword lists and then, through `dhelp`, uses the `cltk` module to download the training corpora and other linguistic data needed for Latin.

In [None]:
# One-time setup: fetch the 'stopwords' resource through nltk's downloader.
import nltk
nltk.download('stopwords')

# One-time setup: run dhelp's LatinText.setup() on an empty text, which
# installs the Latin corpora/linguistic trainers.
LatinText('').setup()

## 2) Configure Models 

In [None]:
# Engine bound to DB_PATH; the sessions opened later in this notebook are
# created from it via `sessionmaker(bind=engine)`.
# NOTE(review): `sql`, `declarative_base` and `sessionmaker` are not imported
# by the import cell — presumably they come from SQLAlchemy; confirm and add
# the imports there.
# NOTE(review): the `encoding` keyword may not be accepted by newer SQLAlchemy
# releases — verify against the installed version.
engine = sql.create_engine(DB_PATH, encoding='utf-8')
# Declarative base class that the Charter model inherits from.
Base = declarative_base()
    

class Charter(Base):
    """ORM model for a single charter stored in the `charters` table.

    The mapped columns hold the charter's metadata and raw text. The
    properties and methods wrap that text in dhelp's `LatinText` to clean,
    lemmatize, count and compare it.
    """
    __tablename__ = 'charters'

    id = sql.Column(sql.String, primary_key=True)
    description = sql.Column(sql.String)
    archive = sql.Column(sql.String)
    language = sql.Column(sql.String)
    scholarly_date_avg = sql.Column(sql.Float)
    text = sql.Column(sql.Text)

    def __str__(self):
        """Return the raw charter text, or an empty string when none is stored."""
        # __str__ has to return a str; returning None would raise TypeError
        return self.text if self.text is not None else ''

    def __repr__(self):
        """Return a short description with the id and a preview of the text.

        Texts longer than 60 characters are cut to their first 60 characters
        and suffixed with '...'; shorter (or missing) texts are shown in full
        without the ellipsis.
        """
        preview_length = 60
        if self.text is not None and len(self.text) > preview_length:
            return "<class 'Charter' id='{}' text='{}...'>".format(
                self.id, self.text[:preview_length]
            )
        return "<class 'Charter' id='{}' text='{}'>".format(self.id, self.text)

    @property
    def text_clean(self):
        """Return the pre-processed text as a LatinText.

        Punctuation ('.', ',', ';', ':', '+', '-') is removed and the text is
        lower-cased, then LatinText's `rm_lines`, `normalize`, `rm_stopwords`
        and `rm_spaces` are applied in that order. Stopwords are read from
        'stopwords_latin.txt' in the working directory, one per line.
        """
        # Strip each entry: a stopword that still carries its trailing newline
        # can never equal a token. Blank lines are skipped. The explicit
        # encoding keeps the file readable the same way on every platform.
        with open('stopwords_latin.txt', encoding='utf-8') as text_file:
            stopwords = [
                word for word in (line.strip() for line in text_file) if word
            ]
        # remove all unwanted punctuation in a single pass over the text
        altered_text = self.text.translate(str.maketrans('', '', '.,;:+-'))
        return (
            LatinText(altered_text.lower())
            .rm_lines()
            .normalize()
            .rm_stopwords(stopwords)
            .rm_spaces()
        )

    @property
    def text_lemmatized(self):
        """Gets the clean form of the text then transforms all words to their lemmata."""
        return self.text_clean.lemmatize()

    @property
    def entities(self):
        """Scans text with cltk's entity recognition and returns a list."""
        return LatinText(self.text).entities()

    def longest_common_substring(self, other_string):
        """Returns the longest substring that this charter and `other_string` share."""
        return LatinText(self.text).longest_common_substring(other_string)

    def compare_minhash(self, other_string):
        """Compares the minhash similarity of this charter's text and `other_string`."""
        return LatinText(self.text).compare_minhash(other_string)

    def word_count(self):
        """Gives a dictionary, each key is a word appearing and the value is the count.

        Counts are taken over the cleaned and lemmatized text.
        """
        return LatinText(str(self.text_lemmatized)).word_count()

    def word_count_raw(self):
        """Same as .word_count(), but counts the raw text without cleaning or lemmatizing."""
        return LatinText(self.text).word_count()

    def clausulae_count(self):
        """Similar to word_count, but instead uses cltk to look for poetic clausulae in prose text."""
        return LatinText(self.text).clausulae()
    
    
class CharterCorpus(UserList):
    """List of individual charter objects, provides methods to aid analysis of the documents."""

    def __init__(self, charter_objs):
        """Populate the corpus from any iterable of charter objects.

        Args:
            charter_objs: iterable yielding the charter objects to store.

        Raises:
            TypeError: if `charter_objs` cannot be iterated.
        """
        # call parent class init function since we are overriding it
        super().__init__()
        # only a failed iter() means "not iterable"; anything else should surface
        try:
            charters = iter(charter_objs)
        except TypeError as err:
            raise TypeError('CharterCorpus must be populated with an iterable') from err
        self.data.extend(charters)

    @classmethod
    def load(cls):
        """Queries the db, returns CharterCorpus with all charters. Usage: `CharterCorpus.load()`."""
        session = sessionmaker(bind=engine)()
        try:
            charter_list = list(session.query(Charter))
        finally:
            # close the session even if the query fails, so it is never leaked
            session.close()
        return cls(charter_list)

    @property
    def charter_ids(self):
        """Returns a new list containing the ids of all charters in this corpus, in order."""
        return [charter_obj.id for charter_obj in self]

    def get_by_id(self, charter_id):
        """Returns the first Charter whose id equals `charter_id`, or None if there is none."""
        for charter in self:
            if charter.id == charter_id:
                return charter
        return None

    def get_by_ids(self, id_list):
        """Returns a new instance of CharterCorpus, populated only with charters matching the id_list."""
        # self.__class__ (not CharterCorpus) so subclasses get their own type back
        return self.__class__(
            [charter_obj for charter_obj in self if charter_obj.id in id_list]
        )

    def minhash_distances(self, print_updates=False):
        """Returns dict with ids as keys and vals are dicts with keys/vals of ids/dists to other charters. e.g...
        {'id 1': {'id 2': 0.5, 'id 3': 0.2}, 'id 2': {'id 1': 0.5, 'id 3': 0.8}, 'id 3': {'id 1': 0.2, 'id 2': 0.8}}

        Args:
            print_updates: when True, the cell output is cleared and a progress
                line is printed for each charter, followed by ' Done!'.
        """
        # one empty inner dict per charter id, filled in below
        distance_dict = {charter_id: {} for charter_id in self.charter_ids}
        total = len(self)
        for position, charter in enumerate(self, start=1):
            if print_updates:
                clear_output()
                print('Working on minhash distances for {} ({}/{}) '.format(charter.id, position, total), end='')
            for other_charter in self:
                # a charter is never compared against itself
                if charter.id == other_charter.id:
                    continue
                distance_dict[charter.id][other_charter.id] = charter.compare_minhash(str(other_charter.text))
        if print_updates:
            print(' Done!')
        return distance_dict


# Confirmation printed once the model-definition cell has run to completion.
print('Models created successfully.')

In [None]:
# Pull every charter out of the database into one corpus.
charter_corpus = CharterCorpus.load()

# Compare each charter against every other one, echoing progress to the cell.
minhash_distances = charter_corpus.minhash_distances(print_updates=True)

# Preview only: stop after the sixth charter and after the sixth comparison
# listed for each of them.
for charter_key_counter, charter_key in enumerate(minhash_distances):
    if charter_key_counter > 5:
        break
    for sub_charter_key_counter, sub_charter_key in enumerate(minhash_distances[charter_key]):
        if sub_charter_key_counter > 5:
            break
        print('{} -> {}: {}'.format(charter_key, sub_charter_key, minhash_distances[charter_key][sub_charter_key]))
    

In [None]:
# Parallel lists of nltk Text objects, one entry per charter in query order.
cleaned_charters = []
lemmatized_charters = []

# open session, read texts from db and store in charters
session = sessionmaker(bind=engine)()

print('Preparing texts (lemmatizing and cleaning)...', end='')
counter = 0
try:
    for counter, charter_obj in enumerate(session.query(Charter), start=1):
        cleaned_charters.append(Text(charter_obj.text_clean.tokenize()))
        lemmatized_charters.append(Text(charter_obj.text_lemmatized.tokenize()))
        # one progress dot for every ten charters processed
        if counter % 10 == 0:
            print('.', end='')
finally:
    # close the session even when a charter fails to clean or lemmatize
    session.close()
print('Done!')

# converts charters from list of texts into text collection
cleaned_charters = TextCollection(cleaned_charters)
lemmatized_charters = TextCollection(lemmatized_charters)

In [None]:
# Draw nltk's dispersion plot for four lemmata over the lemmatized collection.
# NOTE(review): the return value of the plot call is also printed — presumably
# not intended; confirm whether the print() wrapper is needed.
print(lemmatized_charters.dispersion_plot(['iesu', 'christi', 'rex', 'deus']))

In [None]:
# Heading for the statistics on the lemma 'iesu'.
print('Iesu')
# idf of 'iesu' in the lemmatized collection, scaled by 100 and rounded to 2 places.
# NOTE(review): the label below calls this a ratio/percentage while the value
# comes from TextCollection.idf — confirm the label matches the measure.
iesu_idf_scaled = round(lemmatized_charters.idf('iesu') * 100, 2)
print('Frequency to Document Ratio {}%'.format(iesu_idf_scaled))

# Show the first ten concordance lines returned for 'iesu'.
print('Concordance: First 10 appearances')
for concordance_appearance in lemmatized_charters.concordance_list('iesu')[:10]:
    print(concordance_appearance.line)

In [None]:
# Report the contexts nltk finds for 'iesu' and for 'rex' in the lemmatized
# collection, each under its own heading.
# NOTE(review): the return value of common_contexts() is printed as well —
# presumably the method already prints its findings; confirm.
print('Common Contexts')
print('\n---\nIesu:')
print(lemmatized_charters.common_contexts(['iesu']))
print('\n---\nRex:')
print(lemmatized_charters.common_contexts(['rex']))

In [None]:
# Call the collection's plot() with 20 on the cleaned (non-lemmatized) texts.
# NOTE(review): presumably this charts the 20 most frequent tokens, and the
# printed return value is incidental — confirm against the nltk version in use.
print(cleaned_charters.plot(20))

## MORE COMING SOON

For now, try out the network analysis module.