## Harry Potter Text Analysis

Note: This code is licensed under Creative Commons (CC BY-SA).

You are free to...:
- Share — copy and redistribute the material in any medium or format
- Adapt — remix, transform, and build upon the material
- ...for any purpose, even commercially. 

For attribution, please link to my blog: https://medium.com/zareen-farooqui

Import libraries:

In [4]:
import io
import unicodedata
import string
from sets import Set
from collections import defaultdict
import requests
import json

Import files from S3 into this notebook (only needed when connecting to a new cluster).

In [6]:
# text files to copy from the dbfs mount into the notebook's local working directory
files2Read = ["hp1.txt", "hp2.txt", "hp3.txt", "hp4.txt", "hp5.txt", "hp6.txt", "hp7.txt", "positive.txt", "negative.txt"]

for file in files2Read:
  # collect() returns every line of the RDD as a list
  myRDD = sc.textFile("dbfs:/mnt/zareenprojects/hp/%s" % file)
  fullTextList = myRDD.collect()

  # the with-block closes the local copy even if a write fails part-way through
  with open(file, "w") as f:
    for item in fullTextList:
      f.write("%s\n" % item.encode('utf8'))

Check that all files are loaded in the notebook. (This is a magic command, so it must run in a separate cell.)

In [8]:
%sh ls -l

#### Sentiment Class

Make a Sentiment List object for the positive and negative word lists.

In [11]:
class SentimentList():
  """Bag of words loaded from a sentiment word-list file.

  Attributes:
    words: the raw lines of the file, as returned by readlines().
    BoW: every word of the file in file order, lower-cased, reduced to ASCII
      and with punctuation removed.
  """

  def __init__(self, file):
    """Read `file` (UTF-8 text) and build the normalised word list."""
    # the context manager closes the handle as soon as the lines are read
    with io.open(file, 'rt', encoding='utf-8', newline='\n') as handle:
      self.words = handle.readlines()

    # bytes.translate() needs its "delete" argument as bytes on Python 3;
    # on Python 2 this is the very same str as string.punctuation
    punctuation = string.punctuation.encode('ascii')

    self.BoW = []
    for line in self.words:
      # replace em dashes with a space so the words on either side stay separate
      # Research: https://stackoverflow.com/questions/8152820/how-to-do-string-formatting-with-unicode-emdash
      noDash = line.rstrip().replace(u"\u2014", ' ')
      # NFKD splits accented letters into base letter + mark; the ASCII
      # encode with 'ignore' then drops everything that is not ASCII
      asciiLine = unicodedata.normalize('NFKD', noDash).encode('ascii', 'ignore').lower().strip()
      for word in asciiLine.translate(None, punctuation).split():
        # words are already str on Python 2; on Python 3 they are bytes and
        # are decoded so BoW always holds native strings
        self.BoW.append(word if isinstance(word, str) else word.decode('ascii'))

In [12]:
pos_List = SentimentList("positive.txt")
neg_List = SentimentList("negative.txt")

In [13]:
# pass pos_List.BoW and neg_List.BoW to the sentiment analysis
print "+ : ", pos_List.BoW[0:10]
print "- : ", neg_List.BoW[0:10]

#### Book Class

In [15]:
class Book():
  """Text of one book, tokenised once at construction for the analyses below.

  The file is read as UTF-8. Each line is stripped, em dashes are replaced by
  spaces, and the text is reduced to lower-case ASCII before being split on
  whitespace.

  Attributes:
    book: the raw lines of the file.
    BoW: all words in reading order, with punctuation removed.
    BoWnSW: BoW without the words in _STOPWORDS.
    wordCount: (word, count) pairs, highest count first; filled by getWordCount().
    periodCount, questionCount, exclamationCount: number of '.', '?' and '!'
      characters counted by the punctuation pass.
    periodLIST, questionLIST, exclamationLIST: for every counted mark, the value
      of the running word counter at that mark (words read since the counter
      was last reset).
  """

  # bytes.translate() needs its "delete" argument as bytes on Python 3; on
  # Python 2 this is the very same str as string.punctuation
  _PUNCTUATION = string.punctuation.encode('ascii')

  # a frozenset gives constant-time membership tests while filtering BoW
  _STOPWORDS = frozenset([
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost',
    'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst',
    'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
    'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
    'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between',
    'beyond', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'con',
    'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due',
    'during', 'each', 'eg', 'either', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even',
    'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fill', 'find', 'for',
    'former', 'formerly', 'found', 'from', 'front', 'full', 'further', 'get', 'give', 'go',
    'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby',
    'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'i',
    'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its',
    'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many',
    'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
    'move', 'much', 'must', 'my', 'myself', 'namely', 'neither', 'never', 'nevertheless', 'next',
    'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of',
    'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others',
    'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps',
    'please', 'put', 'rather', 're', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems',
    'serious', 'several', 'she', 'should', 'show', 'since', 'sincere', 'so', 'some', 'somehow',
    'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 'take', 'than',
    'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
    'therefore', 'therein', 'thereupon', 'these', 'they', 'this', 'those', 'though', 'through', 'throughout',
    'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'un', 'under',
    'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were',
    'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein',
    'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
    'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your',
    'yours', 'yourself', 'yourselves',
  ])

  def __init__(self, file):
    """Read `file` and build the word lists and punctuation statistics."""
    # the context manager closes the handle as soon as the lines are read
    with io.open(file, 'rt', encoding='utf-8', newline='\n') as handle:
      self.book = handle.readlines()

    self.BoW, self.BoWnSW = [], []
    self.wordCount = []
    self.periodCount, self.questionCount, self.exclamationCount = 0, 0, 0
    self.periodLIST, self.questionLIST, self.exclamationLIST = [], [], []

    # one pass over the text feeds both token streams: words that keep their
    # punctuation (for sentence statistics) and words without it (for BoW)
    punctBoW = []
    for line in self.book:
      asciiLine = self._toAscii(line)
      punctBoW.extend(self._nativeWords(asciiLine))
      self.BoW.extend(self._nativeWords(asciiLine.translate(None, self._PUNCTUATION)))

    self.BoWnSW = [word for word in self.BoW if word not in self._STOPWORDS]
    self._countSentences(punctBoW)

  @staticmethod
  def _toAscii(line):
    """Return `line` stripped, em dashes replaced by spaces, as lower-case ASCII bytes."""
    # Research: https://stackoverflow.com/questions/8152820/how-to-do-string-formatting-with-unicode-emdash
    noDash = line.rstrip().replace(u"\u2014", ' ')
    # NFKD splits accented letters into base letter + mark; the ASCII encode
    # with 'ignore' then drops everything that is not ASCII
    return unicodedata.normalize('NFKD', noDash).encode('ascii', 'ignore').lower().strip()

  @staticmethod
  def _nativeWords(asciiLine):
    """Split on whitespace and return the words as native str objects."""
    # words are already str on Python 2; on Python 3 they are bytes and are decoded
    return [word if isinstance(word, str) else word.decode('ascii') for word in asciiLine.split()]

  def _countSentences(self, punctBoW):
    """Count '.', '?' and '!' marks and record the word counter at each one."""
    wordCounter = 0
    for word in punctBoW:
      wordCounter += 1
      for char in word:
        if char == ".":
          if word[-4:] in ("mr.", "mrs.") or word[-3:] == "...":
            # titles and ellipses: neither counted nor a counter reset
            continue
          elif word[-1:] == '"':
            # word ends with a double quote: the counter restarts but the
            # period is not counted
            wordCounter = 0
          else:
            self.periodCount += 1
            self.periodLIST.append(wordCounter)
            wordCounter = 0
        elif char == "?":
          self.questionCount += 1
          self.questionLIST.append(wordCounter)
          wordCounter = 0
        elif char == "!":
          self.exclamationCount += 1
          self.exclamationLIST.append(wordCounter)
          wordCounter = 0

  # copy dictionary into a list and sort by descending values
  def copyDictSortDes(self, DictToCopy, length = None):
    """Return up to `length` (key, value) pairs of `DictToCopy`, largest value first.

    Equal values are ordered by descending key. With length=None every pair
    is returned.
    """
    if length is None:
      length = len(DictToCopy)
    ranked = sorted(((value, key) for key, value in DictToCopy.items()), reverse = True)
    return [(key, value) for value, key in ranked[0:length]]

  # sort single words by frequency
  def getWordCount(self, listToCount = None):
    """Count the words of `listToCount` and store the sorted result in self.wordCount.

    When no list is given, BoWnSW is counted, or BoW if BoWnSW is empty.
    Nothing is returned.
    """
    if listToCount is None:
      listToCount = self.BoWnSW if self.BoWnSW else self.BoW
    wordCount = {}
    for word in listToCount:
      wordCount[word] = wordCount.get(word, 0) + 1
    self.wordCount = self.copyDictSortDes(wordCount)

  # generate ngrams from one word to 6 words
  def getNgram(self, n):
    """Return n-gram frequencies of BoW for n from 1 to 6.

    n == 1 recounts BoW into self.wordCount and returns that full list.
    n from 2 to 6 returns the 50 most frequent (ngram_tuple, count) pairs.
    Any other n prints an error message and returns an empty list.
    """
    if n == 1:
      self.getWordCount(self.BoW)
      return self.wordCount

    nGram_list = []
    if n in (2, 3, 4, 5, 6):
      # zipping BoW with itself shifted by 0..n-1 yields every run of n consecutive words
      nGram_list = zip(*[self.BoW[offset:] for offset in range(int(n))])
    else:
      # print(...) with a single argument behaves the same on Python 2 and 3
      print("Error: Please enter an n-value from 1 - 6.")

    nGram_dict = {}
    for nGram in nGram_list:
      nGram_dict[nGram] = nGram_dict.get(nGram, 0) + 1
    return self.copyDictSortDes(nGram_dict, 50)

  # returns number of positive and negative words in book and lists out those words
  def sentiment(self, pos_List, neg_List):
    """Print positive/negative word totals and the 30 most frequent words of each kind.

    pos_List and neg_List are iterables of words. A word found in pos_List is
    counted as positive and is not checked against neg_List. BoWnSW is
    scanned, or BoW if BoWnSW is empty. Nothing is returned.
    """
    # sets make each membership test constant-time instead of a list scan
    posWords, negWords = set(pos_List), set(neg_List)
    listToIterate = self.BoWnSW if self.BoWnSW else self.BoW

    posCount, negCount = 0, 0
    posWordsInBook, negWordsInBook = {}, {}
    for word in listToIterate:
      if word in posWords:
        posCount += 1
        posWordsInBook[word] = posWordsInBook.get(word, 0) + 1
      elif word in negWords:
        negCount += 1
        negWordsInBook[word] = negWordsInBook.get(word, 0) + 1

    # single-argument print(...) calls produce the same output on Python 2 and 3
    print("Positive word count is: %s" % posCount)
    print("Negative word count is: %s" % negCount)
    print("The top 30 occuring positive words are: ")
    print(self.copyDictSortDes(posWordsInBook, 30))
    print("The top 30 occuring negative words are: ")
    print(self.copyDictSortDes(negWordsInBook, 30))
    

Make a Book object for each book in the series.

In [17]:
hp1 = Book("hp1.txt")
hp2 = Book("hp2.txt")
hp3 = Book("hp3.txt")
hp4 = Book("hp4.txt")
hp5 = Book("hp5.txt")
hp6 = Book("hp6.txt")
hp7 = Book("hp7.txt")

## Word Counts

Q) How many total words in each book?

In [20]:
print "HP 1: ", len(hp1.BoW)
print "HP 2: ", len(hp2.BoW)
print "HP 3: ", len(hp3.BoW)
print "HP 4: ", len(hp4.BoW)
print "HP 5: ", len(hp5.BoW)
print "HP 6: ", len(hp6.BoW)
print "HP 7: ", len(hp7.BoW)

Q) How many unique words in each book? (each word only counted once)

In [22]:
hp1.getWordCount()
print "HP 1: ", len(hp1.wordCount)
hp2.getWordCount()
print "HP 2: ", len(hp2.wordCount)
hp3.getWordCount()
print "HP 3: ", len(hp3.wordCount)
hp4.getWordCount()
print "HP 4: ", len(hp4.wordCount)
hp5.getWordCount()
print "HP 5: ", len(hp5.wordCount)
hp6.getWordCount()
print "HP 6: ", len(hp6.wordCount)
hp7.getWordCount()
print "HP 7: ", len(hp7.wordCount)

In [23]:
displayHTML("""
<html>
<head>
    
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">   
  
    google.charts.load('current', {packages: ['corechart', 'bar']});
    google.charts.setOnLoadCallback(drawColColors);
    
    function drawColColors() {
      var data = new google.visualization.DataTable();
      data.addColumn('string', 'HP Book');
      data.addColumn('number', 'Total Words');
      data.addColumn('number', 'Unique Words');

      data.addRows([
        ['HP1', 77744, 5687],
        ['HP2', 85551, 6830],
        ['HP3', 107671, 7433],
        ['HP4', 191667, 10166],
        ['HP5', 256982, 12624],
        ['HP6', 169309, 10528],
        ['HP7', 197697, 11387],       
      ]);  

      var options = {
        colors: ['#add8e6', '#e6bbad'],
      };

      var chart = new google.visualization.ColumnChart(document.getElementById('chart_div'));
      chart.draw(data, options);
    }
    
    </script>
</head>

 <body>
  <div id="chart_div" style="width: 900px; height: 500px;"></div>
 </body>
</html>
""")

## Punctuation Analysis

Q) How many period marks in each book?

In [26]:
print "HP 1: ", hp1.periodCount
print "HP 2: ", hp2.periodCount
print "HP 3: ", hp3.periodCount
print "HP 4: ", hp4.periodCount
print "HP 5: ", hp5.periodCount
print "HP 6: ", hp6.periodCount
print "HP 7: ", hp7.periodCount

Q) How long are the average declarative and imperative sentences in each book? (sentences that end with a period mark)

In [28]:
print "HP 1: ", sum (hp1.periodLIST)/(len(hp1.periodLIST))
print "HP 2: ", sum (hp2.periodLIST)/(len(hp2.periodLIST))
print "HP 3: ", sum (hp3.periodLIST)/(len(hp3.periodLIST))
print "HP 4: ", sum (hp4.periodLIST)/(len(hp4.periodLIST))
print "HP 5: ", sum (hp5.periodLIST)/(len(hp5.periodLIST))
print "HP 6: ", sum (hp6.periodLIST)/(len(hp6.periodLIST))
print "HP 7: ", sum (hp7.periodLIST)/(len(hp7.periodLIST))

Q) How many question marks in each book?

In [30]:
print "HP 1: ", hp1.questionCount
print "HP 2: ", hp2.questionCount
print "HP 3: ", hp3.questionCount
print "HP 4: ", hp4.questionCount
print "HP 5: ", hp5.questionCount
print "HP 6: ", hp6.questionCount
print "HP 7: ", hp7.questionCount

Q) How long are the average interrogative sentences in each book? (sentences that end with a question mark)

In [32]:
print "HP 1: ", sum (hp1.questionLIST)/(len(hp1.questionLIST))
print "HP 2: ", sum (hp2.questionLIST)/(len(hp2.questionLIST))
print "HP 3: ", sum (hp3.questionLIST)/(len(hp3.questionLIST))
print "HP 4: ", sum (hp4.questionLIST)/(len(hp4.questionLIST))
print "HP 5: ", sum (hp5.questionLIST)/(len(hp5.questionLIST))
print "HP 6: ", sum (hp6.questionLIST)/(len(hp6.questionLIST))
print "HP 7: ", sum (hp7.questionLIST)/(len(hp7.questionLIST))

Q) How many exclamation marks in each book?

In [34]:
print "HP 1: ", hp1.exclamationCount
print "HP 2: ", hp2.exclamationCount
print "HP 3: ", hp3.exclamationCount
print "HP 4: ", hp4.exclamationCount
print "HP 5: ", hp5.exclamationCount
print "HP 6: ", hp6.exclamationCount
print "HP 7: ", hp7.exclamationCount

Q) How long are the average exclamatory sentences in each book? (sentences that end with an exclamation mark)

In [36]:
print "HP 1: ", sum (hp1.exclamationLIST)/(len(hp1.exclamationLIST))
print "HP 2: ", sum (hp2.exclamationLIST)/(len(hp2.exclamationLIST))
print "HP 3: ", sum (hp3.exclamationLIST)/(len(hp3.exclamationLIST))
print "HP 4: ", sum (hp4.exclamationLIST)/(len(hp4.exclamationLIST))
print "HP 5: ", sum (hp5.exclamationLIST)/(len(hp5.exclamationLIST))
print "HP 6: ", sum (hp6.exclamationLIST)/(len(hp6.exclamationLIST))
print "HP 7: ", sum (hp7.exclamationLIST)/(len(hp7.exclamationLIST))

In [37]:
displayHTML("""
<html>
<head>
    
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">   
  
    google.charts.load('current', {packages: ['corechart', 'bar']});
    google.charts.setOnLoadCallback(drawColColors);
    
    function drawColColors() {
      var data = new google.visualization.DataTable();
      data.addColumn('string', 'HP Book');
      data.addColumn('number', 'Period Marks');
      data.addColumn('number', 'Question Marks');
       data.addColumn('number', 'Exclamation Marks');

      data.addRows([
        ['HP1', 5190, 759, 478],
        ['HP2', 5307, 713, 553],
        ['HP3', 6647, 1063, 1039],
        ['HP4', 10756, 1820, 1392],
        ['HP5', 13596, 2609, 1562],
        ['HP6', 8962, 1744, 1057],
        ['HP7', 10831, 2125, 1546],
        
      ]);

      var options = {
        colors: ['#009d00', '#8000ff', '#ffa500'],
      };

      var chart = new google.visualization.ColumnChart(document.getElementById('chart_div'));
      chart.draw(data, options);
    }
    
    </script>
</head>

 <body>
  <div id="chart_div" style="width: 900px; height: 500px;"></div>
 </body>
</html>
""")

In [38]:
displayHTML("""
<html>
<head>
    
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">   
  
    google.charts.load('current', {packages: ['corechart', 'bar']});
    google.charts.setOnLoadCallback(drawColColors);
    
    function drawColColors() {
      var data = new google.visualization.DataTable();
      data.addColumn('string', 'HP Book');
      data.addColumn('number', 'Declarative or Imperative Sentences');
      data.addColumn('number', 'Interrogative Sentences');
       data.addColumn('number', 'Exclamatory Sentences');

      data.addRows([
        ['HP1', 13, 8, 7],
        ['HP2', 14, 8, 8],
        ['HP3', 13, 8, 7],
        ['HP4', 15, 9, 8],
        ['HP5', 15, 9, 10],
        ['HP6', 15, 9, 10],
        ['HP7', 15, 9, 9],   
      ]);
      
      var options = {
        colors:  ['#009d00', '#8000ff', '#ffa500'],
          vAxis: {
          title: 'Words in Sentence'
        }
      };

      var chart = new google.visualization.ColumnChart(document.getElementById('chart_div'));
      chart.draw(data, options);
    }
    
    </script>
</head>

 <body>
  <div id="chart_div" style="width: 900px; height: 500px;"></div>
 </body>
</html>
""")

## Ngrams

Q) What are the top 50 most common Ngrams from bigrams to sixgrams for each book?

In [41]:
hp1.getNgram(2)

In [42]:
hp1.getNgram(3)

In [43]:
hp1.getNgram(4)

In [44]:
hp1.getNgram(5)

In [45]:
hp1.getNgram(6)

In [46]:
hp2.getNgram(2)

In [47]:
hp2.getNgram(3)

In [48]:
hp2.getNgram(4)

In [49]:
hp2.getNgram(5)

In [50]:
hp2.getNgram(6)

In [51]:
hp3.getNgram(2)

In [52]:
hp3.getNgram(3)

In [53]:
hp3.getNgram(4)

In [54]:
hp3.getNgram(5)

In [55]:
hp3.getNgram(6)

In [56]:
hp4.getNgram(2)

In [57]:
hp4.getNgram(3)

In [58]:
hp4.getNgram(4)

In [59]:
hp4.getNgram(5)

In [60]:
hp4.getNgram(6)

In [61]:
hp5.getNgram(2)

In [62]:
hp5.getNgram(3)

In [63]:
hp5.getNgram(4)

In [64]:
hp5.getNgram(5)

In [65]:
hp5.getNgram(6)

In [66]:
hp6.getNgram(2)

In [67]:
hp6.getNgram(3)

In [68]:
hp6.getNgram(4)

In [69]:
hp6.getNgram(5)

In [70]:
hp6.getNgram(6)

In [71]:
hp7.getNgram(2)

In [72]:
hp7.getNgram(3)

In [73]:
hp7.getNgram(4)

In [74]:
hp7.getNgram(5)

In [75]:
hp7.getNgram(6)

In [76]:
displayHTML("""
  <html>
  <head>
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">
      google.charts.load('current', {'packages':['corechart']});
      google.charts.setOnLoadCallback(drawChart);

      function drawChart() {
        var data = google.visualization.arrayToDataTable([
          ['Book', 'Harry and Ron', 'Harry and Hermione', 'Ron and Hermione'],
          ['HP 1',  36,       13, 32],
          ['HP 2',  61,      9, 51],
          ['HP 3',  40,        41, 92],
          ['HP 4',  61,      14, 125],
          ['HP 5',  70,       42, 119],
          ['HP 6',  35,       8, 77],
          ['HP 7',  24,       20 , 139]
        ]);

        var options = {
          curveType: 'function',
          legend: { position: 'top right' }
        };

        var chart = new google.visualization.LineChart(document.getElementById('curve_chart'));

        chart.draw(data, options);
      }
    </script>
  </head>
  <body>
    <div id="curve_chart" style="width: 900px; height: 500px"></div>
  </body>
</html>
""")

In [77]:
displayHTML("""
  <html>
  <head>
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">
      google.charts.load('current', {'packages':['corechart']});
      google.charts.setOnLoadCallback(drawChart);

      function drawChart() {
        var data = google.visualization.arrayToDataTable([
          ['Book', 'Defense Against the Dark Arts', 'Potions', 'Transfiguration', 'History of Magic'],
          ['HP 1',  3,       15, 5, 4],
          ['HP 2',  9,      23, 9, 5],
          ['HP 3',  22,        11, 9, 7],
          ['HP 4',  14,      17, 11, 6],
          ['HP 5',  46,       35, 24, 11],
          ['HP 6',  28,       70, 18, 5],
          ['HP 7',  4,       5 , 5, 10]
        ]);


        var options = {
          curveType: 'function',
          legend: { position: 'top right' }
        };

        var chart = new google.visualization.LineChart(document.getElementById('curve_chart'));

        chart.draw(data, options);
      }
    </script>
  </head>
  <body>
    <div id="curve_chart" style="width: 900px; height: 500px"></div>
  </body>
</html>
""")

In [78]:
displayHTML("""
  <html>
  <head>
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">
      google.charts.load('current', {'packages':['corechart']});
      google.charts.setOnLoadCallback(drawChart);

      function drawChart() {
        var data = google.visualization.arrayToDataTable([
          ['Book', 'Hogwarts', 'Diagon Alley', 'Ministry of Magic', 'Hogsmeade'],
          ['HP 1',  79,       8, 6, 0],
          ['HP 2',  93,      6, 5, 0],
          ['HP 3',  92,        11, 25, 61],
          ['HP 4',  188,      5, 27, 22],
          ['HP 5',  155,       7, 66,  26],
          ['HP 6',  135,       13, 16, 27],
          ['HP 7',  134,       8 , 7, 12]
        ]);


        var options = {
          curveType: 'function',
          legend: { position: 'top right' }
        };

        var chart = new google.visualization.LineChart(document.getElementById('curve_chart'));

        chart.draw(data, options);
      }
    </script>
  </head>
  <body>
    <div id="curve_chart" style="width: 900px; height: 500px"></div>
  </body>
</html>
""")

## Sentiment Analysis

In [80]:
print "hp1 sentiment anaysis: \n", hp1.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp2 sentiment anaysis: \n", hp2.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp3 sentiment anaysis: \n", hp3.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp4 sentiment anaysis: \n", hp4.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp5 sentiment anaysis: \n", hp5.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp6 sentiment anaysis: \n", hp6.sentiment(pos_List.BoW, neg_List.BoW), "\n \n"
print "hp7 sentiment anaysis: \n", hp7.sentiment(pos_List.BoW, neg_List.BoW)

In [81]:
displayHTML("""
<html>
<head>
    
    <script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>
    <script type="text/javascript">   
  
    google.charts.load('current', {packages: ['corechart', 'bar']});
    google.charts.setOnLoadCallback(drawColColors);
    
    function drawColColors() {
      var data = new google.visualization.DataTable();
      data.addColumn('string', 'HP Book');
      data.addColumn('number', 'Positive Words');
      data.addColumn('number', 'Negative Words');
      

      data.addRows([
        ['HP1', 2161, 2265],
        ['HP2', 2561, 2761],
        ['HP3', 2981, 3779],
        ['HP4', 5665, 6150],
        ['HP5', 7312, 8518],
        ['HP6', 5204, 5596],
        ['HP7', 5616, 6834],
        
      ]);

      var options = {
        colors: ['#33ac71', '#ac3533'],
        vAxis: {
          title: 'Number of words'
        }
      };

      var chart = new google.visualization.ColumnChart(document.getElementById('chart_div'));
      chart.draw(data, options);
    }
    
    </script>
</head>

 <body>
  <div id="chart_div" style="width: 900px; height: 500px;"></div>
 </body>
</html>
""")

## Character Relationship Analysis

Populate HP characters from WikiData

In [84]:
def populateCharacters():
  """Fetch character labels from the Wikidata SPARQL endpoint.

  Returns a list of lower-cased labels with punctuation removed. Labels that
  contain any digit are left out (per the original notes, this removes Q-codes).
  """
  # run query to get all characters from Wikidata API
  hpCharURL = "https://query.wikidata.org/sparql?query= SELECT DISTINCT ?item ?itemLabel WHERE { {?item wdt:P31 ?sub1 . ?sub1 (wdt:P279|wdt:P131)* wd:Q95074 . ?item wdt:P1080 ?sub2 . ?sub2 (wdt:P279|wdt:P131)* wd:Q5410773 } SERVICE wikibase:label { bd:serviceParam wikibase:language 'en' . }} &format = JSON"
  response = requests.get(hpCharURL, headers={"Accept" : "application/json"})
  labels = [binding['itemLabel']['value'] for binding in response.json()['results']['bindings']]

  # keep only labels without digits, lower-cased
  lowered = [label.lower() for label in labels if not any(ch.isdigit() for ch in label)]

  # strip punctuation characters from every remaining name
  return [''.join(ch for ch in name if ch not in string.punctuation) for name in lowered]

In [85]:
CharNames = populateCharacters()
for each in CharNames:
  print each

#### Character class

In [87]:
class Character():
  """A character tracked across the books by one or more alias words.

  Attributes:
    wikiName: the name passed to the constructor.
    aliasesLIST: words that count as a mention of this character.
      Note for revisiting code: make sure to add charName+s (harry, harrys).
    appearancesDICT: book number (starting at 1) -> list of word positions at
      which an alias occurs in that book; filled by getInstances().
    relationshipScores2TargetChar: key -> list, empty by default.
    uniqueInstances2TargetChar: key -> int, 0 by default.
    allInstances2TargetChar: key -> int, 0 by default.
  """

  def __init__(self, name):
    """Create a character with no aliases and no recorded appearances."""
    self.wikiName = name
    self.aliasesLIST = []

    # research: http://sopython.com/canon/77/collections-defaultdict-s-first-argument-must-be-callable/
    self.appearancesDICT = defaultdict(list)
    self.relationshipScores2TargetChar = defaultdict(list)
    # the factory must be int itself so that missing keys read as 0;
    # `lambda: int` would hand back the int *type* instead of a number
    self.uniqueInstances2TargetChar = defaultdict(int)
    self.allInstances2TargetChar = defaultdict(int)

  def getInstances(self, allBoWList):
    """Rebuild appearancesDICT from `allBoWList`, a sequence of word lists (one per book).

    Books are numbered from 1 in the order given. A book in which no alias
    occurs gets no entry.
    """
    self.appearancesDICT.clear()
    # set membership keeps the per-word test constant-time
    aliases = set(self.aliasesLIST)
    for bookNum, bookWords in enumerate(allBoWList, 1):
      for idx, word in enumerate(bookWords):
        if word in aliases:
          self.appearancesDICT[bookNum].append(idx)

  def addName(self, possibleName):
    """Append `possibleName` to aliasesLIST."""
    self.aliasesLIST.append(possibleName)
    

In [88]:
hpBookList = [hp1.BoW, hp2.BoW, hp3.BoW, hp4.BoW, hp5.BoW, hp6.BoW, hp7.BoW]

Make Character objects for analysis

In [90]:
harry = Character("harry potter")
harry.addName("harry")
harry.getInstances(hpBookList)

ron = Character("ron weasley")
ron.addName("ron")
ron.getInstances(hpBookList)

hermione = Character("hermione granger")
hermione.addName("hermione")
hermione.getInstances(hpBookList)

ginny = Character("ginny weasley")
ginny.addName("ginny")
ginny.getInstances(hpBookList)

dumbledore = Character("albus dumbledore")
dumbledore.addName("dumbledore")
dumbledore.getInstances(hpBookList)

sirius = Character("sirius black i")
sirius.addName("sirius")
sirius.getInstances(hpBookList)

snape = Character("severus snape")
snape.addName("snape")
snape.getInstances(hpBookList)

luna = Character("luna lovegood")
luna.addName("luna")
luna.getInstances(hpBookList)

hagrid = Character("rubeus hagrid")
hagrid.addName("hagrid")
hagrid.getInstances(hpBookList)

dobby = Character("dobby")
dobby.addName("dobby")
dobby.getInstances(hpBookList)

dudley = Character("dudley dursley")
dudley.addName("dudley")
dudley.getInstances(hpBookList)

voldemort = Character("tom riddle")
voldemort.addName("voldemort")
voldemort.getInstances(hpBookList)

In [91]:
# george weasley
# fred weasley

fred = Character("fred weasley")
fred.addName("fred")
fred.getInstances(hpBookList)

george = Character("george weasley")
george.addName("george")
george.getInstances(hpBookList)

Q) How many total times do the character names below appear throughout the series?

In [93]:
characters_to_analyze = [harry, ron, hermione, ginny, dumbledore, sirius, snape, luna, hagrid, dobby, dudley, voldemort]

def totalAppearances(char_name):
  totalAppearances = 0
  for each in char_name.appearancesDICT:
    totalAppearances = totalAppearances + len(char_name.appearancesDICT[each])

  print char_name.aliasesLIST[0], "appears",  totalAppearances, "times."
  

for each in characters_to_analyze:
  totalAppearances(each)

Determine character relationship from viewpointChar to targetChar for each book.

In [95]:
def charRelationship(viewpointChar, targetChar):
  """Score how often targetChar is mentioned near viewpointChar, for books 1 to 7.

  For each book, a viewpoint appearance counts as "near" when at least one
  target appearance lies within 40 words on either side of it. The score is
  the percentage of viewpoint appearances that are near, formatted with two
  decimals; a book without viewpoint appearances scores "0.00".

  Results are stored on viewpointChar under targetChar.wikiName:
    relationshipScores2TargetChar: list of the seven score strings (reset on every call).
    uniqueInstances2TargetChar: number of near viewpoint appearances.
    allInstances2TargetChar: number of (viewpoint, target) position pairs in range.
  The two counters are overwritten per book, so after the call they hold the
  values of book 7.
  """
  viewpointChar.relationshipScores2TargetChar[targetChar.wikiName] = []

  for bookNumber in range(1, 8):
    viewpointPositions = viewpointChar.appearancesDICT[bookNumber]
    targetPositions = targetChar.appearancesDICT[bookNumber]

    # both accumulators start fresh for every book, so a book with no
    # viewpoint appearances can neither fail nor reuse the previous book's values
    instances = 0
    nearViewpointPositions = set()

    for ind in viewpointPositions:
      indlow = ind - 40
      indhigh = ind + 40
      for index in targetPositions:
        if indlow <= index <= indhigh:
          instances += 1
          nearViewpointPositions.add(ind)

    if viewpointPositions:
      # float() keeps this a true division on Python 2
      score = (len(nearViewpointPositions) / float(len(viewpointPositions))) * 100
    else:
      score = 0.0

    # Now add the metrics to the viewpointChar's data structure about his/her relationship to targetChar
    viewpointChar.relationshipScores2TargetChar[targetChar.wikiName].append("%.2f" % score)
    viewpointChar.uniqueInstances2TargetChar[targetChar.wikiName] = len(nearViewpointPositions)
    viewpointChar.allInstances2TargetChar[targetChar.wikiName] = instances
    


In [96]:
charRelationship(harry, ron)
charRelationship(ron, harry)
charRelationship(harry, hermione)
charRelationship(hermione, harry)
charRelationship(harry, ginny)
charRelationship(ginny, harry)
charRelationship(harry, dumbledore)
charRelationship(dumbledore, harry)
charRelationship(harry, sirius)
charRelationship(sirius, harry)
charRelationship(harry, snape)
charRelationship(snape, harry)
charRelationship(harry, hagrid)
charRelationship(hagrid, harry)
charRelationship(harry, dudley)
charRelationship(dudley, harry)
charRelationship(harry, voldemort)
charRelationship(voldemort, harry)

In [97]:
charRelationship(ron, hermione)
charRelationship(hermione, ron)
charRelationship(ron, ginny)
charRelationship(ginny, ron)
charRelationship(hermione, ginny)
charRelationship(ginny, hermione)

In [98]:
charRelationship(fred, george)
charRelationship(george, fred)

Q) What is Harry's relationship score to other major characters?

In [100]:
print "harry to ron", harry.relationshipScores2TargetChar[ron.wikiName]
print "ron to harry", ron.relationshipScores2TargetChar[harry.wikiName] 
print "harry to hermione", harry.relationshipScores2TargetChar[hermione.wikiName]
print "hermione to harry", hermione.relationshipScores2TargetChar[harry.wikiName]
print "harry to ginny", harry.relationshipScores2TargetChar[ginny.wikiName]
print "ginny to harry", ginny.relationshipScores2TargetChar[harry.wikiName]
print "harry to dumbledore", harry.relationshipScores2TargetChar[dumbledore.wikiName]
print "dumbledore to harry", dumbledore.relationshipScores2TargetChar[harry.wikiName]
print "harry to sirius", harry.relationshipScores2TargetChar[sirius.wikiName]
print "sirius to harry", sirius.relationshipScores2TargetChar[harry.wikiName]
print "harry to snape", harry.relationshipScores2TargetChar[snape.wikiName]
print "snape to harry", snape.relationshipScores2TargetChar[harry.wikiName]
print "harry to hagrid", harry.relationshipScores2TargetChar[hagrid.wikiName]
print "hagrid to harry", hagrid.relationshipScores2TargetChar[harry.wikiName]
print "harry to dudley", harry.relationshipScores2TargetChar[dudley.wikiName]
print "dudley to harry", dudley.relationshipScores2TargetChar[harry.wikiName]
print "harry to voldemort", harry.relationshipScores2TargetChar[voldemort.wikiName]
print "voldemort to harry", voldemort.relationshipScores2TargetChar[harry.wikiName]

In [101]:
print "ron to hermione", ron.relationshipScores2TargetChar[hermione.wikiName]
print "hermione to ron", hermione.relationshipScores2TargetChar[ron.wikiName]
print "ron to ginny", ron.relationshipScores2TargetChar[ginny.wikiName]
print "ginny to ron", ginny.relationshipScores2TargetChar[ron.wikiName]
print "hermione to ginny", hermione.relationshipScores2TargetChar[ginny.wikiName]
print "ginny to hermione", ginny.relationshipScores2TargetChar[hermione.wikiName]
print "fred to george", fred.relationshipScores2TargetChar[george.wikiName]
print "george to fred", george.relationshipScores2TargetChar[fred.wikiName]


Use the per-book scores stored by charRelationship to compute one overall relationship score (the average across the seven books).

In [103]:
def charRelationshipOverall(viewpointChar, targetChar):
  """Return the viewpoint character's scores towards targetChar averaged over 7 books.

  The stored scores are numeric strings; the result is a string with two decimals.
  """
  perBookScores = viewpointChar.relationshipScores2TargetChar[targetChar.wikiName]
  total = sum(float(score) for score in perBookScores)
  return "%.2f" % (total / 7.0)

In [104]:
print "overall harry to ron",charRelationshipOverall(harry, ron)
print "overall ron to harry",charRelationshipOverall(ron, harry)
print "overall harry to hermione",charRelationshipOverall(harry, hermione)
print "overall hermione to harry",charRelationshipOverall(hermione, harry)
print "overall harry to ginny",charRelationshipOverall(harry, ginny)
print "overall ginny to harry",charRelationshipOverall(ginny, harry)
print "overall harry to dumbledore",charRelationshipOverall(harry, dumbledore)
print "overall dumbledore to harry",charRelationshipOverall(dumbledore, harry)
print "overall harry to sirius",charRelationshipOverall(harry, sirius)
print "overall sirius to harry",charRelationshipOverall(sirius, harry)
print "overall harry to snape",charRelationshipOverall(harry, snape)
print "overall snape to harry",charRelationshipOverall(snape, harry)
print "overall harry to hagrid",charRelationshipOverall(harry, hagrid)
print "overall hagrid to harry",charRelationshipOverall(hagrid, harry)
print "overall harry to dudley",charRelationshipOverall(harry, dudley)
print "overall dudley to harry",charRelationshipOverall(dudley, harry)
print "overall harry to voldemort",charRelationshipOverall(harry, voldemort)
print "overall voldemort to harry",charRelationshipOverall(voldemort, harry)
