In [None]:
#Load in data (adapted from ImportingDataIntoPandas)
import pandas as pd

#Note that for this to work the NoMoreSilence_ProjectData.tsv file needs to be
#in the same directory (folder) that this notebook file is in, and that you started
#the jupyter notebook from.

df = pd.read_csv('NoMoreSilence_ProjectData.tsv', sep='\t')

#Pull the 'Source' column out into its own variable, which we'll use to experiment with
#extracting just the call number for each collection.

#The for loop iterates through each source and uses the .split method to create a new list,
#with each piece of the source (separated by ', ' -- note: comma then space) as a list item.

#This threw an error, because one of the entries was a float, not a string. We have to choose
#either to make it a string or to ignore it. In this code it is made a string with
#source = str(source), but it may actually be better to ignore it (with an if/else statement).

#Note: s_list is overwritten on every pass and never read inside the loop;
#the loop only demonstrates the split.

sources = df['Source']
for source in sources:
    source = str(source)
    s_list = source.split(', ')
    
#Filling out the above to more completely get the call number. Note that this time we are opting to skip
#the row if it has no data for the source, this is contained in the "if type(source) == str:"

collection_list = []
for source in sources:
    # Rows whose source is not a string (missing data) are skipped entirely.
    if type(source) != str:
        continue
    source_list = source.split(', ')
    try:
        # Use the second field, or the third one when the second is empty.
        collection_list.append(source_list[2] if source_list[1] == '' else source_list[1])
    except IndexError:
        # Too few comma-separated fields to find a call number.
        collection_list.append('no data')

#Build a set from the list to pull out the unique values, so we can see the range of values
#we are getting.
#Some values appear twice because of trailing spaces, so we'll need to fix that.

collection_set = set(collection_list)

#We're almost there, but we want the code to remove trailing spaces and to replace spaces with dashes for cleaner data.
#The below does that.

collection_list = []
for source in sources:
    # Default for rows whose source is not a string.
    call_no = 'blank'
    if type(source) == str:
        source_list = source.split(', ')
        try:
            # Second field, or the third when the second is empty.
            call_no = source_list[2] if source_list[1] == '' else source_list[1]
            # Drop a single trailing space (an empty value raises IndexError here).
            if call_no[-1] == ' ':
                call_no = call_no[0:-1]
            call_no = call_no.replace(' ', '-')
        except IndexError:
            call_no = 'no-data'
    collection_list.append(call_no)

collection_set = set(collection_list)

#Now we need to take the code above and turn it into a function that will run on the "Source" field
#for every line in the dataframe. We need to define its inputs a little differently, and do the function
#definition.

def get_call_no(row):
    """Extract a dash-separated call number from a row's 'Source' field.

    The source is split on ', '. The call number is the second field, or the
    third field when the second one is empty. Surrounding whitespace is
    removed (all of it, not just one trailing space) and inner spaces are
    replaced with dashes.

    Parameters:
        row: anything indexable by 'Source' (a DataFrame row or a plain dict).

    Returns:
        'blank'   if the source is not a string (e.g. a missing value),
        'no-data' if the source has too few fields or the chosen field is empty,
        otherwise the cleaned call number, e.g. 'MSS-98-47'.
    """
    source = row['Source']
    if not isinstance(source, str):
        return 'blank'

    source_list = source.split(', ')
    if len(source_list) < 2:
        return 'no-data'

    candidate = source_list[1]
    if candidate == '':
        # Second field is empty: the call number is expected in the third field.
        if len(source_list) < 3 or source_list[2] == '':
            return 'no-data'
        candidate = source_list[2]

    # strip() handles any amount of leading/trailing whitespace, so values such as
    # 'MSS 98-47  ' no longer end up with a stray trailing dash.
    return candidate.strip().replace(' ', '-')

#This used the function we just defined above to go through each row in the dataframe and pull out the call_no
#and put it into a new column called 'call_no', which we've defined simply by naming it in the 'df['call_no'] = ...'

df['call_no'] = df.apply(get_call_no, axis=1)

#Series.unique() lets us check the same thing we checked above with set() -- that there are no repeat values.

call_nums = df['call_no'].unique()

#Now we can select one collection: build a True/False mask marking the rows whose call number
#matches, and use that mask to select rows from df.

act_up = df['call_no'].eq('MSS-98-47')
df.loc[act_up]

In [None]:
#Strip leading/trailing spaces from every column name.
df.columns = [column_name.strip(' ') for column_name in df.columns]

#Take the identifier of the first row in the selected collection.
document_id = df[act_up].iloc[0]['Local Identifier']

#Select every row carrying that identifier.
is_selected_document = df['Local Identifier'] == document_id
document = df[is_selected_document]
print(document)

In [None]:
#Let's take a look at the column "Ocr text" (OCR stands for optical character recognition)
#For more info see here: https://towardsdatascience.com/a-gentle-introduction-to-ocr-ee1469a201aa
#Anyone notice any issues?
#Bare last expression: the notebook displays the "Ocr text" value of the first row in `document` as a string.
str(document["Ocr text"].values[0])

In [None]:
#We need to add spaces between words! But how do we know what is and isn't a word?
#Keep the "Ocr text" value of the first row in `document`; later cells reuse document_text.
document_text = document["Ocr text"].values[0]

#First, what language is this text?
from langdetect import detect
#Bare last expression, so the notebook shows whatever detect() returns for this text.
detect(document_text)

In [None]:
#One way would be to get a list of "all" words from GitHub.
#https://github.com/dwyl/english-words
import requests

def load_words(target_url='https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt',
               timeout=30):
    """Download a whitespace-separated word list and return it as a set.

    Parameters:
        target_url: address of the plain-text word list (defaults to the
            dwyl/english-words list on GitHub).
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A set containing every whitespace-separated token in the response.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status (otherwise an error page would be returned as
            if it were a word list).
    """
    response = requests.get(target_url, timeout=timeout)
    response.raise_for_status()
    return set(response.text.split())

#english_words = load_words()
#print(english_words)

In [None]:
#Oh no! It didn't work!
#What else could we do?
#Maybe NLP?
import nltk

#This will open a Window to download files for the NLTK package.
#nltk.download()

In [None]:
#Let's split sentences and then do words, one chunk at a time.
from nltk.tokenize import sent_tokenize, word_tokenize

#Split the document text into a list of sentences.
sentences = sent_tokenize(document_text)

#Some look good and others... Not so much.
print(sentences)

In [None]:
#Let's get one sentence.
#The index 5 is hard-coded: this raises IndexError if there are fewer than six sentences.
example_sentence = sentences[5]
print(example_sentence)

In [None]:
#Let's try to split the example sentence apart into words.
words = word_tokenize(example_sentence)
print(words)

In [None]:
#Let's try a spell checker again, but make it more efficient.
#https://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words
import wordninja

#wordninja.split takes the whole sentence string and gives back the pieces it found.
spell_checked_words = wordninja.split(example_sentence)
print(spell_checked_words)

In [None]:
#Not perfect, but definitely better!
#Let's put the sentence back together:

#Join the pieces back together with a single space between each.
corrected_sentence = " ".join(spell_checked_words)
print(corrected_sentence)

#There are better ways to do this that employ deep learning models
#but they are a bit too complicated to go into detail here.
#For example: https://github.com/atpaino/deep-text-corrector

In [None]:
#Can we find out anything else from a sentence?

#Let's look at grammar first (parts of speech, specifically).
#The corrected sentence is tokenized again, then tagged; the bare expression displays the result.
nltk.pos_tag(word_tokenize(corrected_sentence))

In [None]:
#What do these abbreviations mean?
#NOTE(review): presumably this prints the tag reference and needs NLTK's tagset help data downloaded -- confirm.
nltk.help.upenn_tagset()

In [None]:
#We can also look at the sentence's sentiment.
#That is, positivity, negativity, or neutrality.
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

#SIA is the import alias for nltk's SentimentIntensityAnalyzer.
sia = SIA()
#Score the single corrected example sentence.
pol_score = sia.polarity_scores(corrected_sentence)
print(pol_score)

#Seems like it is mostly neutral!

In [None]:
#Let's go through and correct each sentence of our OCR.
#corrected_ocr ends up with one re-spaced string per entry in `sentences`.
corrected_ocr = []

for sentence in sentences:
    #Same two steps as for the example sentence: split into pieces, then rejoin with spaces.
    spell_checked_words = wordninja.split(sentence)
    corrected_sentence = " ".join(spell_checked_words)
    corrected_ocr.append(corrected_sentence)
    
print(corrected_ocr)

In [None]:
#Wow that's pretty long... Can we summarize it?
#Well yes! But it's complicated.
#Great step-by-step tutorial here:
#https://stackabuse.com/text-summarization-with-nltk-in-python/
#But we'll use a pre-existing package, sumy.

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

#Settings shared by the summarizer cells: language name and how many sentences to ask for.
LANGUAGE = "english"
SENTENCES_COUNT = 1

#Glue the corrected sentences back into one text, separated by ". ".
full_text = ". ".join(corrected_ocr)
parser = PlaintextParser.from_string(full_text, Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
#Summarizer is the import alias for sumy's LsaSummarizer.
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

#Print each sentence the summarizer picks.
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

In [None]:
#Are there other summary methods?
from sumy.summarizers.edmundson import EdmundsonSummarizer as Edmundson
from sumy.summarizers.luhn import LuhnSummarizer as Luhn

print("\nLuhn:")
#Luhn is the import alias for sumy's LuhnSummarizer. This cell reuses stemmer, parser,
#LANGUAGE and SENTENCES_COUNT from the previous summarizer cell.
summarizer = Luhn(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)

In [None]:
#Slightly better! Turns out text quality is pretty important here.

#Now let's do sentiment analysis on each sentence.
sia = SIA()
polarity_results = []
for sentence in corrected_ocr:
    # Copy the score dict and add the sentence it belongs to under the "sentence" key.
    pol_score = dict(sia.polarity_scores(sentence), sentence=sentence)
    polarity_results.append(pol_score)

#One row per sentence: the scores plus the sentence text.
sentences_df = pd.DataFrame.from_records(polarity_results)
sentences_df.head()

In [None]:
#We can even do this with the entire document.
#But... It might take a while.

#Let's try something simpler with our corrected text.
#Let's stem all the words in the doc.
from nltk.stem import PorterStemmer

ps = PorterStemmer()

#Tokenize every corrected sentence and stem each token, collecting them all in one flat list.
#The loop variable is called `token` so this cell no longer overwrites the `words` list
#that was built from the example sentence.
stemmed_words = [
    ps.stem(token)
    for sentence in corrected_ocr
    for token in word_tokenize(sentence)
]

print(stemmed_words[:20])

In [None]:
#So we have a lot that doesn't tell us anything... yet.

#Let's start by removing the numbers.
without_numbers = []

for word in stemmed_words:
    # Keep every character that is not a digit.
    new_string = "".join(ch for ch in word if not ch.isdigit())
    # Words made up only of digits become empty and are dropped.
    if not new_string:
        continue
    without_numbers.append(new_string)

print(without_numbers[:20])

In [None]:
#Next, let's make everything lowercase.
#Builds a new list; without_numbers itself is left unchanged.
lowercase_words = [x.lower() for x in without_numbers]
print(lowercase_words[:20])

In [None]:
#Now let's look at word frequency.
from collections import Counter

#Counter tallies every word in a single pass over the list (calling list.count once per
#unique word re-scans the whole list each time).
word_freq = dict(Counter(lowercase_words))
unique_words = set(word_freq)

#Sorted by frequency, most common first. Words with equal counts stay in order of first
#appearance, so the output is the same on every run.
sorted_word_freq = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_word_freq[:50])

In [None]:
#Notice anything?
#We need to remove a few things still...

#Like punctuation:
import string
from collections import Counter

#Translation table that deletes every character listed in string.punctuation; built once.
punctuation_table = str.maketrans('', '', string.punctuation)

without_punctuation = []
for word in lowercase_words:
    new_string = word.translate(punctuation_table)
    #Tokens that were pure punctuation become empty and are dropped.
    if new_string:
        without_punctuation.append(new_string)

#Recount in a single pass; equal counts stay in order of first appearance.
word_freq = dict(Counter(without_punctuation))
unique_words = set(word_freq)
sorted_word_freq = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_word_freq[:50])

In [None]:
#And single letters:
from collections import Counter

#Keep only words that are longer than one character.
no_single_letters = [word for word in without_punctuation if len(word) > 1]

#Recount in a single pass; equal counts stay in order of first appearance.
word_freq = dict(Counter(no_single_letters))
unique_words = set(word_freq)
sorted_word_freq = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_word_freq[:50])

In [None]:
#And stopwords:
from nltk.corpus import stopwords

from collections import Counter

#Fetch the stopword list once and keep it as a set: asking for stopwords.words('english')
#inside the loop rebuilt the list for every single word.
english_stopwords = set(stopwords.words('english'))

no_stopwords = [word for word in no_single_letters if word not in english_stopwords]

#Recount in a single pass; equal counts stay in order of first appearance.
word_freq = dict(Counter(no_stopwords))
unique_words = set(word_freq)
sorted_word_freq = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)
top_fifty = sorted_word_freq[:50]
print(top_fifty)

In [None]:
#Let's visualize this a bit.
%matplotlib inline
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#White background, capped at 500 words.
wc = WordCloud(background_color = "white", max_words = 500)
#word_freq is the word -> count dictionary from the stopword cell.
wc.generate_from_frequencies(word_freq)

#Show the cloud as an image with the axes hidden.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#Maybe this isn't giving as much info as we'd like...
#Let's do a barchart instead.
plt.title("Word Frequencies")
plt.ylabel("# of Occurrences")
plt.xlabel("Word")

# top_fifty holds (word, count) pairs: words go on the x axis, counts become the bar heights.
bar_labels = [entry[0] for entry in top_fifty]
bar_heights = [entry[1] for entry in top_fifty]
plt.bar(bar_labels, bar_heights)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Okay, now we've got the basics!
#Let's move on to vectorization.
#We'll start with bag of words.

#Actually! We already did that. This is a bag of words here:
print(top_fifty)

In [None]:
#However, what we really want is this part (just the counts, without the words):
print([i[1] for i in top_fifty])

#This is our vector.

In [None]:
#Whereas this is our vocabulary (just the words, in the same order as the counts):
print([i[0] for i in top_fifty])

#What information do we lose by vectorizing?
#How could we try to retain that knowledge?

In [None]:
#So do we have to do all of the above steps for every document?
#Thankfully, not really! sklearn and nltk do a lot of that for us.
#(Except the OCR correction... We still have to do that mostly ourselves.)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

#Tokens are runs of ASCII letters and digits. The range must be written A-Za-z: the range
#"A-z" also covers the characters between 'Z' and 'a' ( [ \ ] ^ _ ` ), which would let
#punctuation into the tokens.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#Lowercase everything, drop English stop words, count single words only (unigrams).
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)


In [None]:
#Let's make sure we remember to use wordninja!

#Let's pick 50 documents from two collections to compare.
#Which two collections should we use?
#It could be interesting to compare a government/legal
#collection to a more personal collection.

from collections import Counter

#Count the records in each collection in a single pass over the column.
collection_freq = dict(Counter(df['call_no']))
unique_collect = set(collection_freq)

print(collection_freq)

#Let's use:
#  2000-46 - AIDS Legal Referral Panel Records (85 records)
#  2003-09 - Linda Alband Collection of Randy Shilts Materials (209 records)

aids_legal_referral_papers = df['call_no'] == '2000-46'
linda_alband_papers = df['call_no'] == '2003-09'

#A fixed seed means the same 50 documents are drawn on every run, so results can be
#reproduced. .copy() gives independent frames that are safe to add columns to.
SAMPLE_SEED = 100
aids_legal_referral_papers_50_random = df[aids_legal_referral_papers].sample(n=50, random_state=SAMPLE_SEED).copy()
linda_alband_papers_50_random = df[linda_alband_papers].sample(n=50, random_state=SAMPLE_SEED).copy()

In [None]:
#Now let's correct these.
def correct_ocr_text(text):
    """Re-space one document's OCR text and return it as a single string.

    The text is split into sentences, each sentence is split into words with
    wordninja, and everything is joined back together with single spaces.
    A value that is not a string (for example a missing OCR entry) yields ''.
    """
    if not isinstance(text, str):
        return ""
    corrected_sentences = [
        " ".join(wordninja.split(sentence))
        for sentence in sent_tokenize(text)
    ]
    return " ".join(corrected_sentences)

#Correct each sampled document's own text. (Previously both loops tokenized document_text,
#the single example document, so every entry in both lists was the same text.)
corrected_ocr_aids_legal = [
    correct_ocr_text(x) for x in aids_legal_referral_papers_50_random["Ocr text"]
]
corrected_ocr_linda_alband = [
    correct_ocr_text(x) for x in linda_alband_papers_50_random["Ocr text"]
]

#Let's look at a couple.
print(corrected_ocr_linda_alband[:2])

In [None]:
#Next, we'll paste these to our small dataframes.
#Attach the corrected text to each sample as a new column.
aids_legal_referral_papers_50_random["CorrectedOCR"] = corrected_ocr_aids_legal
linda_alband_papers_50_random["CorrectedOCR"] = corrected_ocr_linda_alband

#And then we'll add a "Class" label column: 0 for the AIDS Legal Referral Panel documents
#and 1 for the Linda Alband documents. A single value is applied to every row, so this
#keeps working if the sample size is changed.
aids_legal_referral_papers_50_random["Class"] = 0
linda_alband_papers_50_random["Class"] = 1

#Let's look at a couple.
print(linda_alband_papers_50_random[:2])

In [None]:
#Finally, we'll paste these two dataframes together.
#axis=0 stacks the rows: the AIDS Legal Referral rows come first, then the Linda Alband rows.
complete_df = pd.concat([aids_legal_referral_papers_50_random, linda_alband_papers_50_random], axis=0)
print(complete_df[:2])

In [None]:
#Now let's finish preprocessing by vectorizing!
#The vectorizer is fitted on every document in complete_df, i.e. before the data is split
#into training and testing sets in the next cell.
text_counts = cv.fit_transform(complete_df['CorrectedOCR'])

#Peek at the first two documents' entries.
print(text_counts[:2])

In [None]:
#Now let's split our data into a training and testing sets.
from sklearn.model_selection import train_test_split

#Features are the count vectors, labels are the "Class" column.
#30% of the rows are held out for testing; random_state fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    complete_df['Class'],
    test_size = 0.3,
    random_state = 100
)

In [None]:
#Next we'll use a multinomial Naive Bayes classifier to see how well we can predict which set a document is in.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#Fit on the training rows, predict the held-out rows, and report the share predicted correctly.
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

In [None]:
#Whoops... That's not so great (unless you got a great random split).
#Feel free to experiment with different test_size parameters!

#Here we'll try TF-IDF instead!
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#Default TfidfVectorizer settings: unlike `cv`, no custom tokenizer or stop-word list is passed here.
tf = TfidfVectorizer()
text_tf = tf.fit_transform(complete_df['CorrectedOCR'])

#This overwrites X_train/X_test/y_train/y_test from the count-vector split.
#Note the different random_state (200 here, 100 before), so the held-out rows may differ too.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, 
    complete_df['Class'], 
    test_size = 0.3,
    random_state = 200
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#Same classifier as before, now trained on whatever X_train/y_train currently hold
#(the TF-IDF split, when the cells are run in order).
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

In [None]:
#What if we change the classifier type?
#SGD Classifier?
#How many iterations should we use?
from sklearn.linear_model import SGDClassifier

#At most 20 passes over the training data, with a stopping tolerance of 1e-3.
#NOTE(review): no random_state is passed, so the score presumably varies between runs -- confirm.
clf = SGDClassifier(max_iter=20, tol=1e-3).fit(X_train, y_train)
predicted = clf.predict(X_test)
print("SGDClassifier Accuracy:", metrics.accuracy_score(y_test, predicted))

In [None]:
#We could also try a couple more!
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron

#PassiveAggressiveClassifier first, then Perceptron: each is fitted on the training rows,
#used to predict the held-out rows, and reported with its accuracy.
for banner, estimator in (
    ("PassiveAggressiveClassifier Accuracy:", PassiveAggressiveClassifier(tol=1e-3)),
    ("Perceptron Accuracy:", Perceptron(tol=1e-3)),
):
    clf = estimator.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print(banner, metrics.accuracy_score(y_test, predicted))

In [None]:
#Unfortunately, we probably don't have much time to continue
#this, but with more documents and corrected text, we
#could summarize whole collections, predict if a document
#belongs to a collection, or even generate completely new
#documents! See here:
#https://www.analyticsvidhya.com/blog/2018/03/text-generation-using-python-nlp/