In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import math
import itertools
from collections import OrderedDict

In [2]:
# Worksheet name passed as `sheet_name=` when the book-list workbook is read.
SHEETNAME="Final_Booklist"
# Passed as `header=` to pd.read_excel, i.e. the first row holds the column names.
ZERO=0
# NOTE(review): absolute, machine-specific paths; the notebook only runs where
# these exist. Consider deriving them from a configurable base directory.
booklist_filepath = "/Users/surajshashidhar/git/fiction/Final_Booklist.xlsx"
epubs_filepath = "/Users/surajshashidhar/git/fiction/testing_epubs_extracted"

In [3]:
def read_booklist_and_preprocess(booklist_filepath):
    """Load the book-list workbook and index it by Project-Gutenberg-style id.

    Reads sheet ``SHEETNAME`` of ``booklist_filepath`` (header row ``ZERO``),
    prints the first ten rows, then keeps only the last row for every
    duplicated ``bid``.

    Parameters
    ----------
    booklist_filepath : str
        Path of the Excel workbook; it must contain the columns ``bid``,
        ``bname`` and ``blang``.

    Returns
    -------
    list
        ``[book_lang_dict, df]`` where ``book_lang_dict`` maps
        ``"pg" + str(bid)`` to ``[bid, blang, bname]`` and ``df`` is the
        de-duplicated DataFrame.
    """
    raw = pd.read_excel(io=booklist_filepath, sheet_name=SHEETNAME, header=ZERO)
    print("Reading and printing book list file")
    print(raw.head(10))

    # Keep the last occurrence of each book id; the original row labels are preserved.
    deduped = raw.drop_duplicates(subset=["bid"], keep="last")

    columns = deduped.to_dict(orient="list")
    book_lang_dict = {
        "pg" + str(book_id): [book_id, language, title]
        for book_id, title, language in zip(columns["bid"], columns["bname"], columns["blang"])
    }

    # Small sanity peek at the generated keys.
    print(list(book_lang_dict)[1:3])
    print("==== ========= ======== ======== ========== ========= ")
    print()
    return [book_lang_dict, deduped]

In [4]:
# Load the book list: a pg-id lookup dict plus the de-duplicated DataFrame.
book_lang_dict, booklist_df = read_booklist_and_preprocess(booklist_filepath)

Reading and printing book list file
   bid                          bname blang  auth_id              auth_name  \
0   78             Tarzan of the Apes    en   2429.0  Burroughs, Edgar Rice   
1   81           The Return of Tarzan    en   2429.0  Burroughs, Edgar Rice   
2   85           The Beasts of Tarzan    en   2429.0  Burroughs, Edgar Rice   
3   90              The Son of Tarzan    en   2429.0  Burroughs, Edgar Rice   
4   92  Tarzan and the Jewels of Opar    en   2429.0  Burroughs, Edgar Rice   
5  106         Jungle Tales of Tarzan    en   2429.0  Burroughs, Edgar Rice   
6  123            At the Earth's Core    en   2429.0  Burroughs, Edgar Rice   
7  331                     The Mucker    en   2429.0  Burroughs, Edgar Rice   
8  363             The Oakdale Affair    en   2429.0  Burroughs, Edgar Rice   
9  364                   The Mad King    en   2429.0  Burroughs, Edgar Rice   

    sid        sub   cat  
0  79.0  Adventure  LCSH  
1  79.0  Adventure  LCSH  
2  79.0  Adve

In [5]:
# Summary statistics of the numeric columns of the de-duplicated book list.
booklist_df.describe()

Unnamed: 0,auth_id,sid
count,2182.0,2151.0
mean,2609.702566,1525.330079
std,4609.988537,3492.187648
min,1.0,1.0
25%,361.25,147.5
50%,754.0,567.0
75%,2336.25,1083.0
max,25360.0,26951.0


In [6]:
#booklist_df.drop_duplicates(subset = ["bid"], keep = "last", inplace = True)

In [7]:
def read_bookpath_and_extract_pgid(epubs_filepath):
    """Walk a directory tree and index non-empty ``*-content.html`` files by pg-id.

    A file is indexed when its name ends with ``.html``, its size is greater
    than zero, and its name contains ``pg<digits>...-content.html``. Any file
    of size zero (whatever its extension) is reported on stdout and skipped.

    Parameters
    ----------
    epubs_filepath : str
        Root directory handed to ``os.walk``.

    Returns
    -------
    dict
        ``{"pg<digits>": [file_name, full_path]}``. When several files share
        an id, the one visited last wins.
    """
    books_path_dict = {}
    # The dot before "html" is escaped so it only matches a literal ".", and at
    # least one digit is required so a digit-less name can no longer produce
    # the meaningless key "pg".
    filename_pattern = re.compile(r"(pg)([0-9]+).*?-content(\.html)")
    print(epubs_filepath)
    for root, dirs, files in os.walk(epubs_filepath):
        print("Parsing directory {} for html files".format(root))

        for file_name in files:
            full_path = os.path.join(root, file_name)
            # Stat once and reuse the size for both checks.
            size = os.stat(full_path).st_size
            if size == 0:
                print("Empty file found: {}".format(file_name))
                continue
            if not file_name.endswith(".html"):
                continue
            match = filename_pattern.search(file_name)
            if match:
                books_path_dict[match.group(1) + match.group(2)] = [file_name, full_path]

    return books_path_dict




In [8]:
# Index the extracted epub HTML files on disk by their pg-id.
books_path_dict = read_bookpath_and_extract_pgid(epubs_filepath)

/Users/surajshashidhar/git/fiction/testing_epubs_extracted
Parsing directory /Users/surajshashidhar/git/fiction/testing_epubs_extracted for html files


In [9]:
# Number of distinct pg-ids found on disk.
len(books_path_dict)

37

In [10]:
def read_html_and_strip_tags(book_paths_dict, books_lang_dict):
    """Read every book's HTML file and return its text with tags removed.

    Parameters
    ----------
    book_paths_dict : dict
        ``{pgid: [file_name, full_path]}``; only ``full_path`` (item 1) is read.
    books_lang_dict : dict
        ``{pgid: [bid, language, name]}``; items 1 and 2 supply the language
        and the name. Books missing from this dict are still processed, with
        language ``"en"`` and name ``"default"``, and a notice is printed.

    Returns
    -------
    dict
        ``{pgid: [text, language, name]}``.

    Raises
    ------
    Exception
        If a listed path is not an existing file.

    Notes
    -----
    Tags are removed with the non-greedy pattern ``<.*?>``. ``.`` does not
    match newlines, so a tag spanning several lines is left in place
    (unchanged from the previous behaviour).
    """
    tag_pattern = re.compile("<.*?>")
    books_text_dict = {}
    for pgid, vals in book_paths_dict.items():
        filepath = vals[1]
        if not os.path.isfile(filepath):
            raise Exception("File {} not found".format(filepath))

        if pgid in books_lang_dict:
            language = books_lang_dict[pgid][1]
            name = books_lang_dict[pgid][2]
        else:
            # Not in the book list: fall back to English and a placeholder name.
            print("Book {} found in filepath but not in book list file, but processing it anyway".format(pgid))
            language = "en"
            name = "default"

        # The context manager closes the handle; it used to be leaked for every book.
        with open(filepath, "r") as html_file:
            html_content = html_file.read()
        books_text_dict[pgid] = [tag_pattern.sub("", html_content), language, name]

    print("Number of html books extracted to dict: {}".format(len(books_text_dict)))
    print()
    return books_text_dict

In [11]:
# Strip HTML tags from every discovered book and attach language and name.
books_text_dict = read_html_and_strip_tags(books_path_dict, book_lang_dict)

Number of html books extracted to dict: 37



In [12]:
# Look up four specific book ids in the book list.
# NOTE(review): the ids are given as strings, so only rows whose 'bid' is
# stored as str can match; confirm the column's dtype.
#booklist_df.loc[booklist_df['bid'].isin(["13720HermanVoyage1", "29042CarolTangTale", "4763CarolLogik", "13721HermanVoyage2"])]

booklist_df.loc[booklist_df['bid'].isin(["13720", "29042", "4763", "13721"])]

Unnamed: 0,bid,bname,blang,auth_id,auth_name,sid,sub,cat
2470,13720,Mardi: and A Voyage Thither I,en,965.0,Melville| Herman,965.0,Sea and Adventure,
2471,13721,Mardi: and A Voyage Thither ||,en,966.0,Melville| Herman,966.0,Sea and Adventure,
2486,29042,A Tangled Tale,en,981.0,Carroll| Lewis,981.0,Literary,
2492,4763,The Game of Logic,en,987.0,Carroll| Lewis,987.0,Literary,


In [13]:
# First ten rows of the de-duplicated book list (row labels keep their original values).
booklist_df.head(10)

Unnamed: 0,bid,bname,blang,auth_id,auth_name,sid,sub,cat
7,331,The Mucker,en,2429.0,"Burroughs, Edgar Rice",79.0,Adventure,LCSH
8,363,The Oakdale Affair,en,2429.0,"Burroughs, Edgar Rice",79.0,Adventure,LCSH
11,605,Pellucidar,en,2429.0,"Burroughs, Edgar Rice",79.0,Adventure,LCSH
14,7100,"Adventures of Huckleberry Finn, Chapters 01 to 05",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
15,7101,"Adventures of Huckleberry Finn, Chapters 06 to 10",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
16,7102,"Adventures of Huckleberry Finn, Chapters 11 to 15",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
17,7103,"Adventures of Huckleberry Finn, Chapters 16 to 20",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
18,7104,"Adventures of Huckleberry Finn, Chapters 21 to 25",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
19,7105,"Adventures of Huckleberry Finn, Chapters 26 to 30",en,249.0,"Twain, Mark",79.0,Adventure,LCSH
20,7106,"Adventures of Huckleberry Finn, Chapters 31 to 35",en,249.0,"Twain, Mark",79.0,Adventure,LCSH


In [14]:
# Show the pg-id -> [file name, full path] mapping.
books_path_dict

{'pg537': ['pg537DoyleTerrorTales-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg537DoyleTerrorTales-content.html'],
 'pg1400': ['pg1400DickensGreatExp-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg1400DickensGreatExp-content.html'],
 'pg161': ['pg161SJaneAusSensSensi-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg161SJaneAusSensSensi-content.html'],
 'pg834': ['pg834DoyleMemoirsSherlk-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg834DoyleMemoirsSherlk-content.html'],
 'pg2911': ['pg2911Galsw2-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg2911Galsw2-content.html'],
 'pg11': ['pg11CarolAlice-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg11CarolAlice-content.html'],
 'pg2684': ['pg2684Galsw4-content.html',
  '/Users/surajshashidhar/git/fiction/testing_epubs_extracted/pg2684Galsw4-content.html'],

In [None]:
# Number of books whose text was extracted.
len(books_text_dict)

In [None]:
# Hand-written toy input: pg-id -> [first vector, second vector], eight values each.
# The flattening cell and the `cols` list treat these as the start and end emotion vectors.
book_feature_vector = {"pg123": [[0.1, 0.2, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8], [0.1, 0.2, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8]],
                       "pg567": [[0.9, 0.2, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8], [0.7, 0.2, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8]],
                       "pg147": [[0.9, 0.6, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8], [0.7, 0.1, 0.4, 0.5, 0.1, 0.3, 0.5, 0.8]]}

In [None]:
# Concatenate each book's nested vectors into one flat list per pg-id.
flattened_feature_vector = {
    key: list(itertools.chain.from_iterable(val))
    for key, val in book_feature_vector.items()
}

#flattened_feature_vector

In [None]:
# Column names for the flattened vectors: eight "start_*" columns followed by
# eight "end_*" columns, in the same emotion order.
cols = ["start_anger", "start_anticipation", "start_disgust", "start_fear", 
        "start_joy", "start_sadness", "start_surprise", "start_trust",
       "end_anger", "end_anticipation", "end_disgust", "end_fear", "end_joy", 
        "end_sadness", "end_surprise", "end_trust"]

In [None]:
# One row per book (indexed by pg-id), one column per entry of `cols`.
df = pd.DataFrame.from_dict(flattened_feature_vector, orient='index',columns=cols)

In [None]:
# Move the pg-id out of the index into a "bid" column, then renumber rows from 1.
# NOTE: this mutates `df` in place, so re-running the cell would copy the
# numeric index into "bid".
df["bid"] = df.index
df.index = list(range(1, len(df.index) + 1))
df

In [67]:
# Expected column layout of a feature-vector file: sixteen emotion columns plus
# the book name and id.
# NOTE(review): the id column is called "pgid" here, but the CSV loaded in this
# notebook shows it as "bid"; this constant is not referenced in the visible
# cells. Confirm which name is intended.
FEATURE_VECTOR_COLS = ["start_anger", "start_anticipation", "start_disgust", "start_fear", 
        "start_joy", "start_sadness", "start_surprise", "start_trust",
       "end_anger", "end_anticipation", "end_disgust", "end_fear", "end_joy", 
        "end_sadness", "end_surprise", "end_trust", "bname","pgid"]

In [273]:
# Load the per-book start/end emotion vectors; this rebinds `df`, replacing the toy frame built above.
df = pd.read_csv("/Users/surajshashidhar/git/fiction/feature3_vectors_1emp_20pcnt.csv", header = 0)

In [274]:
# Peek at the loaded feature vectors.
df.head()

Unnamed: 0,start_anger,start_anticipation,start_disgust,start_fear,start_joy,start_sadness,start_surprise,start_trust,end_anger,end_anticipation,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust,bname,bid
0,0.080718,0.163835,0.073542,0.134179,0.10724,0.117385,0.08869,0.234411,0.107751,0.169135,0.056489,0.118105,0.091282,0.112004,0.074992,0.270243,Tales of Terror and Mystery,pg537
1,0.105634,0.162635,0.098587,0.129421,0.098261,0.125351,0.078577,0.201534,0.085802,0.194942,0.072125,0.116259,0.108292,0.129635,0.071188,0.221756,Great Expectations,pg1400
2,0.069083,0.188998,0.064824,0.067749,0.177323,0.111964,0.079864,0.240195,0.072665,0.210878,0.054606,0.115956,0.154606,0.105455,0.067166,0.218669,Sense and Sensibility,pg161
3,0.076279,0.160068,0.05905,0.115369,0.0965,0.122958,0.078869,0.290907,0.068238,0.203351,0.04219,0.132846,0.091646,0.120504,0.104439,0.236786,Memoirs of Shelock Holmes,pg834
4,0.074812,0.131955,0.040476,0.042888,0.069142,0.100031,0.058271,0.482425,0.075147,0.07743,0.041389,0.104142,0.110241,0.147065,0.154599,0.289987,Justice,pg2911


In [245]:
# Re-key the frame by book id so that to_dict('index') yields {bid: {column: value}}.
# NOTE(review): this mutates `df` in place, and the cell's execution count is
# lower than that of the read_csv cell above it. The values displayed for
# tmp_dict may therefore come from a different load of `df`; confirm with
# Restart & Run All.
df.index = df.bid
tmp_dict = df.to_dict('index')

In [246]:
# NOTE(review): feature_vector_dict is initialised here but not used in the
# visible cells; the cell's output is just tmp_dict.
feature_vector_dict = {}
tmp_dict

{'pg537': {'start_anger': 0.7954918266108884,
  'start_anticipation': 1.6240868833511808,
  'start_disgust': 0.7012732613823647,
  'start_fear': 1.3906191837930562,
  'start_joy': 1.0203267362692192,
  'start_sadness': 1.236111030449743,
  'start_surprise': 0.8541990115611356,
  'start_trust': 2.3778920665824117,
  'end_anger': 0.9942957159140732,
  'end_anticipation': 1.6213286600395684,
  'end_disgust': 0.6181476117275754,
  'end_fear': 1.2385696250430551,
  'end_joy': 0.9177022102587824,
  'end_sadness': 1.2152053893409098,
  'end_surprise': 0.8184120300507278,
  'end_trust': 2.5763387576253067,
  'bname': 'Tales of Terror and Mystery',
  'bid': 'pg537'},
 'pg1400': {'start_anger': 1.0696646430571737,
  'start_anticipation': 1.6217174861655137,
  'start_disgust': 0.9516142960385306,
  'start_fear': 1.2292820057012803,
  'start_joy': 1.0212736449848028,
  'start_sadness': 1.2343043709317634,
  'start_surprise': 0.7998698822380177,
  'start_trust': 2.0722736708829186,
  'end_anger': 0

In [247]:
# Emotion order shared by the "start_*" and "end_*" halves of every row.
EMOTIONS = ("anger", "anticipation", "disgust", "fear",
            "joy", "sadness", "surprise", "trust")

book_start_vector = []
book_end_vector = []
book_name = ""
book_feature_vector_dict = {}
# Reshape {bid: {column: value}} into {bid: [start_vector, end_vector, name]}.
for key, val in tmp_dict.items():
    pgid = key
    book_start_vector = [val["start_" + emotion] for emotion in EMOTIONS]
    book_end_vector = [val["end_" + emotion] for emotion in EMOTIONS]
    book_name = val["bname"]
    book_feature_vector_dict[key] = [book_start_vector, book_end_vector, book_name]

book_feature_vector_dict

{'pg537': [[0.7954918266108884,
   1.6240868833511808,
   0.7012732613823647,
   1.3906191837930562,
   1.0203267362692192,
   1.236111030449743,
   0.8541990115611356,
   2.3778920665824117],
  [0.9942957159140732,
   1.6213286600395684,
   0.6181476117275754,
   1.2385696250430551,
   0.9177022102587824,
   1.2152053893409098,
   0.8184120300507278,
   2.5763387576253067],
  'Tales of Terror and Mystery'],
 'pg1400': [[1.0696646430571737,
   1.6217174861655137,
   0.9516142960385306,
   1.2292820057012803,
   1.0212736449848028,
   1.2343043709317634,
   0.7998698822380177,
   2.0722736708829186],
  [0.869029816195338,
   1.9464510857264603,
   0.6637841816686275,
   1.2018382103350695,
   1.0932504818173114,
   1.259700041106748,
   0.7166012014416443,
   2.2493449817088007],
  'Great Expectations'],
 'pg161': [[0.6922759914157786,
   1.8582746246521518,
   0.6270345925011492,
   0.7532059507859606,
   1.7180606853446874,
   1.1695827287627454,
   0.8044789613810966,
   2.3770864651

In [248]:
def find_book_similarities( feature_vector, book1, book2="", type="L2_BETWEEN_BOOKS"):
    """Compute emotion-vector similarity for one book or between two books.

    Parameters
    ----------
    feature_vector : dict
        Maps a book id to a sequence whose items 0, 1 and 2 are the start
        vector, the end vector and the book name.
    book1 : hashable
        Id of the first (or only) book.
    book2 : hashable, optional
        Id of the second book; only read when ``type == "L2_BETWEEN_BOOKS"``.
    type : str
        ``"L2_BETWEEN_BOOKS"`` compares book1 with book2 (start vs start and
        end vs end) using ``find_L2_similarity``.
        ``"COSINE"`` compares book1's own start and end vectors using
        ``find_Cosine_similarity``.
        ``"L2"`` compares book1's own start and end vectors using
        ``find_L2_similarity``.
        The name shadows the builtin but is kept because callers pass it by
        keyword.

    Returns
    -------
    list
        ``[start_similarity, end_similarity, book1_name, book2_name]`` for
        ``"L2_BETWEEN_BOOKS"``; ``[similarity, book1_name]`` otherwise.

    Raises
    ------
    ValueError
        If ``type`` is not one of the three supported modes. ``ValueError`` is
        a subclass of ``Exception``, so callers catching ``Exception`` are
        unaffected.
    KeyError
        If a requested book id is missing from ``feature_vector``.
    """
    if type not in ("L2_BETWEEN_BOOKS", "COSINE", "L2"):
        # The previous code had an unreachable `return` after this raise; it is gone.
        raise ValueError("Can only run for cosine and L2 similarity")

    book1_start = feature_vector[book1][0]
    book1_end = feature_vector[book1][1]

    if type == "L2_BETWEEN_BOOKS":
        book2_start = feature_vector[book2][0]
        book2_end = feature_vector[book2][1]
        book1_name = feature_vector[book1][2]
        book2_name = feature_vector[book2][2]
        start_similarity = find_L2_similarity(book1_start, book2_start)
        end_similarity = find_L2_similarity(book1_end, book2_end)
        return [start_similarity, end_similarity, book1_name, book2_name]

    # Single-book modes: how far the book's ending is from its beginning.
    book1_name = feature_vector[book1][2]
    if type == "COSINE":
        book_similarity = find_Cosine_similarity(book1_start, book1_end)
    else:
        book_similarity = find_L2_similarity(book1_start, book1_end)

    print("Similarity between start and end for the book {} is : {}".format(book1, book_similarity))
    return [book_similarity, book1_name]


In [249]:
def find_Cosine_similarity( X, Y):
    """Return the cosine similarity of two numeric sequences.

    Computes ``dot(X, Y) / (|X| * |Y|)`` and prints the two magnitudes and
    the dot product.

    Parameters
    ----------
    X, Y : sequence of numbers
        The dot product pairs items with ``zip``, so it only covers the
        shorter sequence, while each magnitude uses the whole sequence.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when the product of the magnitudes
        is zero (for example an all-zero or empty vector). This case used to
        raise ``ZeroDivisionError``.
    """
    x_distance = math.sqrt(sum(a ** 2 for a in X))
    y_distance = math.sqrt(sum(b ** 2 for b in Y))

    dot_product = sum(a * b for a, b in zip(X, Y))
    print("x: {}, y: {}, dotproduct: {}".format(x_distance, y_distance, dot_product))

    norm_product = x_distance * y_distance
    if norm_product == 0:
        # Cosine is undefined for a zero-length vector; report "no similarity".
        return 0.0
    return dot_product / norm_product


In [250]:
def find_L2_similarity( X, Y):
    """Map the Euclidean distance between X and Y to a similarity in (0, 1].

    The result is ``1 / (1 + distance)``: identical vectors give ``1.0`` and
    the value shrinks as the distance grows. Items are paired with ``zip``,
    so extra items of the longer sequence are ignored.
    """
    squared_gaps = ((x_i - y_i) ** 2 for x_i, y_i in zip(X, Y))
    euclidean = math.sqrt(sum(squared_gaps))
    return 1 / (1 + euclidean)

In [251]:
# Spot check: start/end L2 similarity between two specific books.
query_book1 = "pg21816"
query_book2 = "pg161"
start_similarity, end_similarity, book1_name, book2_name = find_book_similarities(book_feature_vector_dict, query_book1, query_book2, type="L2_BETWEEN_BOOKS")

In [252]:
# Every ordered pair of distinct book ids, so both (a, b) and (b, a) appear.
book_list = list(book_feature_vector_dict)
permutation_list = [*itertools.permutations(book_list, 2)]
print(permutation_list[1:6])

[('pg537', 'pg161'), ('pg537', 'pg834'), ('pg537', 'pg2911'), ('pg537', 'pg11'), ('pg537', 'pg2684')]


In [253]:
def run_for_all_books(feature_vector, permutation_list, simtype = "L2_BETWEEN_BOOKS"):
    """Run ``find_book_similarities`` over many books and tabulate the results.

    Parameters
    ----------
    feature_vector : dict
        ``{book_id: [start_vector, end_vector, book_name]}``.
    permutation_list : sequence
        Pairs ``(book1_id, book2_id)``; only used when
        ``simtype == "L2_BETWEEN_BOOKS"``.
    simtype : str
        ``"L2_BETWEEN_BOOKS"`` scores every pair in ``permutation_list``.
        ``"L2"`` scores every book's start vector against its own end vector.

    Returns
    -------
    list or None
        ``[similarity_dict, dfObj]`` for the two supported modes, ``None`` for
        any other ``simtype``.

        * ``"L2_BETWEEN_BOOKS"``: the dict is keyed by
          ``"book1|book2|book1_name|book2_name"`` and holds
          ``[start_sim, end_sim]``; the frame has columns ``book1_id,
          book2_id, book1_name, book2_name, start_sim, end_sim``.
        * ``"L2"``: the dict maps book id to similarity; the frame has
          columns ``book_id, book_name, similarity``.

        Rows are labelled 0..n-1 by the default index.
    """
    if simtype == "L2_BETWEEN_BOOKS":
        similarity_dict = {}
        rows = []
        for pair in permutation_list:
            book1 = pair[0]
            book2 = pair[1]
            start_sim, end_sim, book1_name, book2_name = find_book_similarities(
                feature_vector, book1, book2, type="L2_BETWEEN_BOOKS")
            pair_key = str(book1) + "|" + str(book2) + "|" + str(book1_name) + "|" + str(book2_name)
            similarity_dict[pair_key] = [start_sim, end_sim]
            rows.append((book1, book2, book1_name, book2_name, start_sim, end_sim))

        dfObj = pd.DataFrame(
            rows,
            columns=['book1_id', 'book2_id', 'book1_name', 'book2_name', 'start_sim', 'end_sim'])
        print(dfObj.head())
        return [similarity_dict, dfObj]

    if simtype == "L2":
        similarity_dict = {}
        rows = []
        print(feature_vector.keys())
        for key in list(feature_vector.keys()):
            similarity, book_name = find_book_similarities(feature_vector, book1=key, type="L2")
            similarity_dict[key] = similarity
            rows.append((key, book_name, similarity))

        # Build the frame once, after the loop. It used to be rebuilt on every
        # iteration and was left undefined when feature_vector was empty.
        dfObj = pd.DataFrame(rows, columns=['book_id', 'book_name', 'similarity'])
        return [similarity_dict, dfObj]

    # Unsupported mode: kept as a None return for backward compatibility.
    return None

In [254]:
# Score every book's start vector against its own end vector ("L2" mode);
# permutation_list is passed but is not used by that mode.
similarity_dict, dfObj = run_for_all_books(book_feature_vector_dict, permutation_list, simtype = "L2")
print(book_feature_vector_dict.keys())

dict_keys(['pg537', 'pg1400', 'pg161', 'pg834', 'pg2911', 'pg11', 'pg2684', 'pg158', 'pg105', 'pg2919', 'pg2097', 'pg2149', 'pg34970', 'pg4045', 'pg21816', 'pg4765', 'pg4397', 'pg2150', 'pg13720', 'pg730', 'pg19337', 'pg620', 'pg29042', 'pg4763', 'pg766', 'pg2701', 'pg13721', 'pg2852', 'pg12', 'pg98', 'pg2148', 'pg1155', 'pg2147', 'pg1342', 'pg108', 'pg141', 'pg786'])
Similarity between start and end for the book pg537 is : 0.7417716444714493
Similarity between start and end for the book pg1400 is : 0.6566441189566189
Similarity between start and end for the book pg161 is : 0.6427596822070479
Similarity between start and end for the book pg834 is : 0.5600456588873279
Similarity between start and end for the book pg2911 is : 0.37711274911210296
Similarity between start and end for the book pg11 is : 0.4348028053680374
Similarity between start and end for the book pg2684 is : 0.48316667977202515
Similarity between start and end for the book pg158 is : 0.6171622023483011
Similarity betwee

In [255]:
# Book id -> start/end similarity from the "L2" run.
similarity_dict

{'pg537': 0.7417716444714493,
 'pg1400': 0.6566441189566189,
 'pg161': 0.6427596822070479,
 'pg834': 0.5600456588873279,
 'pg2911': 0.37711274911210296,
 'pg11': 0.4348028053680374,
 'pg2684': 0.48316667977202515,
 'pg158': 0.6171622023483011,
 'pg105': 0.8741919531607095,
 'pg2919': 0.41236213957329365,
 'pg2097': 0.678286862060833,
 'pg2149': 0.4782837821663619,
 'pg34970': 0.4972343049393828,
 'pg4045': 0.519997442830225,
 'pg21816': 0.5633746559933585,
 'pg4765': 0.347652909320829,
 'pg4397': 0.602993384056208,
 'pg2150': 0.6640522308411263,
 'pg13720': 0.5871234559589501,
 'pg730': 0.5854544642248152,
 'pg19337': 0.5995600560376512,
 'pg620': 0.6712170067379889,
 'pg29042': 0.4342130196123464,
 'pg4763': 0.32564446475048087,
 'pg766': 0.8202494660963746,
 'pg2701': 0.6991535075777959,
 'pg13721': 0.5935985349613043,
 'pg2852': 0.3526293843942273,
 'pg12': 0.6544417542259907,
 'pg98': 0.41741430418213643,
 'pg2148': 0.5878211755720519,
 'pg1155': 0.6124542362231636,
 'pg2147': 0.56

In [256]:
# Same results in tabular form, still in book order (unsorted).
dfObj.head(10)

Unnamed: 0,book_id,book_name,similarity
0,pg537,Tales of Terror and Mystery,0.741772
1,pg1400,Great Expectations,0.656644
2,pg161,Sense and Sensibility,0.64276
3,pg834,Memoirs of Shelock Holmes,0.560046
4,pg2911,Justice,0.377113
5,pg11,Alice's Adventures in Wonderland,0.434803
6,pg2684,Five Tales,0.483167
7,pg158,Emma,0.617162
8,pg105,Persuasion,0.874192
9,pg2919,The Little Man,0.412362


In [257]:
# Selects which similarity mode the post-processing cells below act on.
simtype = "L2"
if(simtype == "L2_BETWEEN_BOOKS"):
    # Rank each book1's partners by combined start+end similarity, best first.
    dfObj["sum_sim"] = dfObj["start_sim"] + dfObj["end_sim"]
    dfObj.sort_values(by=["book1_id", "sum_sim", "start_sim", "end_sim"], ascending = [True, False, False, False], inplace = True)
    # NOTE: an expression nested inside an if-block is not displayed by the notebook.
    dfObj.head(10)
elif(simtype == "L2"):
    # Most similar start/end first; sorts dfObj in place, so later cells see the sorted order.
    dfObj.sort_values(by=["similarity"], ascending = [False], inplace = True)
    dfObj.head(10)

In [258]:
if(simtype == "L2_BETWEEN_BOOKS"):
    # Keep the first ten rows of each book1 group, in dfObj's current row order.
    final_df = dfObj.groupby('book1_id').head(10)
    final_df.head(100)
elif(simtype == "L2"):
    # No per-book grouping in this mode; final_df is the same object as dfObj.
    final_df = dfObj

In [259]:
# Destination CSV per similarity mode; any other mode writes nothing.
output_paths = {
    "L2_BETWEEN_BOOKS": "/Users/surajshashidhar/git/fiction/L2_between_books_20emp_15pcnt.csv",
    "L2": "/Users/surajshashidhar/git/fiction/L2_10emp_25pcnt.csv",
}
if simtype in output_paths:
    final_df.to_csv(output_paths[simtype], index = False)



In [260]:
# Books ranked by how similar their ending is to their beginning.
final_df.head(40)

Unnamed: 0,book_id,book_name,similarity
8,pg105,Persuasion,0.874192
24,pg766,David Copperfield,0.820249
34,pg108,Return of Sherlock Holmes,0.786976
0,pg537,Tales of Terror and Mystery,0.741772
25,pg2701,Moby Dick,0.699154
33,pg1342,Pride and Prejudice,0.698631
10,pg2097,The Sign of Four,0.678287
21,pg620,Sylvie and Bruno,0.671217
17,pg2150,The Works of Edgar Allan Poe IV,0.664052
1,pg1400,Great Expectations,0.656644


In [261]:
# Load the extracted feature table; its "bookId-chunkNo" column identifies each row.
efdf = pd.read_csv("/Users/surajshashidhar/git/fiction/Features_Extracted.csv", header = 0)
efdf.head(10)

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21
0,pg2684Galsw4-5,0.0266,0.4743,0.0844,0.0766,0.0265,0.0223,0.0308,0.0493,0.0413,...,0.0245,0.1105,-0.2611,0.0201,0.25,0.25,0.5,0.7673,-0.0176,1.2495
1,pg2684Galsw4-4,0.0234,0.351,0.1071,0.073,0.0344,0.0263,0.0279,0.054,0.0449,...,0.0183,0.1057,-0.2398,0.0168,0.3939,0.2121,0.3939,0.784,-0.0176,1.2495
2,pg2911Galsw2-1,0.0535,0.2497,0.0527,0.0923,0.0193,0.0225,0.0216,0.0513,0.1075,...,0.0093,0.087,-0.4715,0.0024,0.5591,0.129,0.3118,0.8524,-0.35,8.6291
3,pg2684Galsw4-7,0.0276,0.3596,0.1987,0.0646,0.028,0.027,0.0359,0.0678,0.0413,...,0.0155,0.1222,-0.1974,0.0211,0.3939,0.1212,0.4848,0.8031,-0.0176,1.2495
4,pg2684Galsw4-6,0.0234,0.4625,0.0525,0.0777,0.0289,0.0241,0.0312,0.0565,0.0409,...,0.0228,0.1149,-0.2434,0.019,0.375,0.0938,0.5313,0.8152,-0.0176,1.2495
5,pg2684Galsw4-1,0.0322,0.4385,0.1241,0.0755,0.0228,0.0266,0.0276,0.0528,0.043,...,0.0201,0.1059,-0.2836,0.0266,0.2188,0.25,0.5313,0.8141,-0.0176,1.2495
6,pg2684Galsw4-3,0.0219,0.4148,0.0606,0.0753,0.0286,0.0267,0.0325,0.0526,0.0426,...,0.0185,0.1095,-0.2325,0.0188,0.1563,0.3125,0.5313,0.7893,-0.0176,1.2495
7,pg2684Galsw4-2,0.015,0.5148,0.1424,0.0721,0.0262,0.031,0.0314,0.0553,0.0443,...,0.0145,0.1057,-0.2107,0.0103,0.3667,0.1333,0.5,0.7779,-0.0176,1.2495
8,pg1342JaneAustenPP-9,0.015,0.162,0.313,0.0684,0.0316,0.0251,0.039,0.059,0.0362,...,0.0073,0.1135,-0.0097,0.0121,0.087,0.3478,0.5652,0.6154,-0.2083,2.8595
9,pg1342JaneAustenPP-8,0.0089,0.1981,0.2257,0.0795,0.0366,0.0269,0.0366,0.0625,0.03,...,0.005,0.1152,0.1505,0.0061,0.0435,0.2609,0.6957,0.5394,-0.2083,2.8595


In [265]:
# Split "bookId-chunkNo" on a regex with two capture groups: the pg-id lands in
# result column 1 and the chunk number in column 2. Columns 0 and 3 hold the
# text before and after the match (empty in the displayed rows).
tmp_df = efdf["bookId-chunkNo"].str.split(r"(pg\d*).*?-(\d*)", n = 1, expand = True)
tmp_df.head(10)

Unnamed: 0,0,1,2,3
0,,pg2684,5,
1,,pg2684,4,
2,,pg2911,1,
3,,pg2684,7,
4,,pg2684,6,
5,,pg2684,1,
6,,pg2684,3,
7,,pg2684,2,
8,,pg1342,9,
9,,pg1342,8,


In [266]:
# Attach the parsed pg-id and chunk number to the feature table (adds columns to efdf in place).
# NOTE(review): chunk_id comes from a string split, so it is presumably text rather than an integer; confirm.
efdf["book_id"] = tmp_df[1]
efdf["chunk_id"] = tmp_df[2]
efdf.head(20)

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,F14,F15,F16,F17,F18,F19,F20,F21,book_id,chunk_id
0,pg2684Galsw4-5,0.0266,0.4743,0.0844,0.0766,0.0265,0.0223,0.0308,0.0493,0.0413,...,-0.2611,0.0201,0.25,0.25,0.5,0.7673,-0.0176,1.2495,pg2684,5
1,pg2684Galsw4-4,0.0234,0.351,0.1071,0.073,0.0344,0.0263,0.0279,0.054,0.0449,...,-0.2398,0.0168,0.3939,0.2121,0.3939,0.784,-0.0176,1.2495,pg2684,4
2,pg2911Galsw2-1,0.0535,0.2497,0.0527,0.0923,0.0193,0.0225,0.0216,0.0513,0.1075,...,-0.4715,0.0024,0.5591,0.129,0.3118,0.8524,-0.35,8.6291,pg2911,1
3,pg2684Galsw4-7,0.0276,0.3596,0.1987,0.0646,0.028,0.027,0.0359,0.0678,0.0413,...,-0.1974,0.0211,0.3939,0.1212,0.4848,0.8031,-0.0176,1.2495,pg2684,7
4,pg2684Galsw4-6,0.0234,0.4625,0.0525,0.0777,0.0289,0.0241,0.0312,0.0565,0.0409,...,-0.2434,0.019,0.375,0.0938,0.5313,0.8152,-0.0176,1.2495,pg2684,6
5,pg2684Galsw4-1,0.0322,0.4385,0.1241,0.0755,0.0228,0.0266,0.0276,0.0528,0.043,...,-0.2836,0.0266,0.2188,0.25,0.5313,0.8141,-0.0176,1.2495,pg2684,1
6,pg2684Galsw4-3,0.0219,0.4148,0.0606,0.0753,0.0286,0.0267,0.0325,0.0526,0.0426,...,-0.2325,0.0188,0.1563,0.3125,0.5313,0.7893,-0.0176,1.2495,pg2684,3
7,pg2684Galsw4-2,0.015,0.5148,0.1424,0.0721,0.0262,0.031,0.0314,0.0553,0.0443,...,-0.2107,0.0103,0.3667,0.1333,0.5,0.7779,-0.0176,1.2495,pg2684,2
8,pg1342JaneAustenPP-9,0.015,0.162,0.313,0.0684,0.0316,0.0251,0.039,0.059,0.0362,...,-0.0097,0.0121,0.087,0.3478,0.5652,0.6154,-0.2083,2.8595,pg1342,9
9,pg1342JaneAustenPP-8,0.0089,0.1981,0.2257,0.0795,0.0366,0.0269,0.0366,0.0625,0.03,...,0.1505,0.0061,0.0435,0.2609,0.6957,0.5394,-0.2083,2.8595,pg1342,8


In [277]:
# Join chunk-level features with the per-book emotion vectors on the book id.
# The inner join drops chunks whose book has no emotion vector, and vice versa.
redf = pd.merge(left=efdf, right=df, how='inner', left_on = ["book_id"], right_on = ["bid"])
print(redf.describe())
redf.head(10)

               F0          F1          F2          F3          F4          F5  \
count  405.000000  405.000000  405.000000  405.000000  405.000000  405.000000   
mean     0.015618    0.233379    0.112360    0.071273    0.025199    0.026818   
std      0.007020    0.098301    0.093679    0.017309    0.005760    0.003238   
min      0.002400    0.023100    0.000000    0.032800    0.008500    0.019400   
25%      0.011000    0.163700    0.032600    0.057300    0.021600    0.024500   
50%      0.015000    0.220500    0.090600    0.074100    0.025300    0.026600   
75%      0.018900    0.282100    0.169500    0.085900    0.028600    0.029000   
max      0.060400    0.567700    0.453400    0.104600    0.043600    0.038900   

               F6          F7          F8          F9  ...  start_surprise  \
count  405.000000  405.000000  405.000000  405.000000  ...      405.000000   
mean     0.034741    0.072537    0.036079    0.001643  ...        0.083467   
std      0.005026    0.011738    0.0

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,end_anger,end_anticipation,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust,bname,bid
0,pg2684Galsw4-5,0.0266,0.4743,0.0844,0.0766,0.0265,0.0223,0.0308,0.0493,0.0413,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
1,pg2684Galsw4-4,0.0234,0.351,0.1071,0.073,0.0344,0.0263,0.0279,0.054,0.0449,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
2,pg2684Galsw4-7,0.0276,0.3596,0.1987,0.0646,0.028,0.027,0.0359,0.0678,0.0413,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
3,pg2684Galsw4-6,0.0234,0.4625,0.0525,0.0777,0.0289,0.0241,0.0312,0.0565,0.0409,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
4,pg2684Galsw4-1,0.0322,0.4385,0.1241,0.0755,0.0228,0.0266,0.0276,0.0528,0.043,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
5,pg2684Galsw4-3,0.0219,0.4148,0.0606,0.0753,0.0286,0.0267,0.0325,0.0526,0.0426,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
6,pg2684Galsw4-2,0.015,0.5148,0.1424,0.0721,0.0262,0.031,0.0314,0.0553,0.0443,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
7,pg2684Galsw4-9,0.0118,0.4907,0.145,0.0574,0.0288,0.0298,0.0339,0.0638,0.0356,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
8,pg2684Galsw4-8,0.0218,0.3996,0.1737,0.0727,0.0269,0.0282,0.0387,0.0679,0.0359,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684
9,pg2684Galsw4-12,0.0073,0.5543,0.1653,0.0744,0.0302,0.0323,0.045,0.056,0.0407,...,0.090419,0.18541,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684


In [278]:
# Load per-book start/end L2 similarities from a previously written CSV.
# NOTE(review): this reads "L2_10emp_20pcnt.csv", while the export cell above
# writes "L2_10emp_25pcnt.csv"; confirm this is the intended file.
l2df = pd.read_csv("/Users/surajshashidhar/git/fiction/L2_10emp_20pcnt.csv", header = 0)

In [280]:
# Peek at the loaded similarity table.
l2df.head(40)

Unnamed: 0,book_id,book_name,similarity
0,pg108,Return of Sherlock Holmes,0.777252
1,pg105,Persuasion,0.771771
2,pg766,David Copperfield,0.729004
3,pg2097,The Sign of Four,0.665281
4,pg2701,Moby Dick,0.659062
5,pg1342,Pride and Prejudice,0.653537
6,pg1400,Great Expectations,0.650252
7,pg537,Tales of Terror and Mystery,0.643721
8,pg12,Through the Looking-Glass,0.629213
9,pg2150,The Works of Edgar Allan Poe IV,0.620253


In [281]:
# Add the per-book similarity to every chunk row. The inner join on book_id
# keeps only books present in both frames; the suffixes apply only to
# non-key columns that share a name.
redf_f = pd.merge(left=redf, right=l2df, how='inner', left_on = ["book_id"], right_on = ["book_id"], suffixes=('_x', '_y'))
print(redf_f.describe())
redf_f.head(10)

               F0          F1          F2          F3          F4          F5  \
count  405.000000  405.000000  405.000000  405.000000  405.000000  405.000000   
mean     0.015618    0.233379    0.112360    0.071273    0.025199    0.026818   
std      0.007020    0.098301    0.093679    0.017309    0.005760    0.003238   
min      0.002400    0.023100    0.000000    0.032800    0.008500    0.019400   
25%      0.011000    0.163700    0.032600    0.057300    0.021600    0.024500   
50%      0.015000    0.220500    0.090600    0.074100    0.025300    0.026600   
75%      0.018900    0.282100    0.169500    0.085900    0.028600    0.029000   
max      0.060400    0.567700    0.453400    0.104600    0.043600    0.038900   

               F6          F7          F8          F9  ...  start_trust  \
count  405.000000  405.000000  405.000000  405.000000  ...   405.000000   
mean     0.034741    0.072537    0.036079    0.001643  ...     0.248523   
std      0.005026    0.011738    0.010072    

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust,bname,bid,book_name,similarity
0,pg2684Galsw4-5,0.0266,0.4743,0.0844,0.0766,0.0265,0.0223,0.0308,0.0493,0.0413,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
1,pg2684Galsw4-4,0.0234,0.351,0.1071,0.073,0.0344,0.0263,0.0279,0.054,0.0449,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
2,pg2684Galsw4-7,0.0276,0.3596,0.1987,0.0646,0.028,0.027,0.0359,0.0678,0.0413,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
3,pg2684Galsw4-6,0.0234,0.4625,0.0525,0.0777,0.0289,0.0241,0.0312,0.0565,0.0409,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
4,pg2684Galsw4-1,0.0322,0.4385,0.1241,0.0755,0.0228,0.0266,0.0276,0.0528,0.043,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
5,pg2684Galsw4-3,0.0219,0.4148,0.0606,0.0753,0.0286,0.0267,0.0325,0.0526,0.0426,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
6,pg2684Galsw4-2,0.015,0.5148,0.1424,0.0721,0.0262,0.031,0.0314,0.0553,0.0443,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
7,pg2684Galsw4-9,0.0118,0.4907,0.145,0.0574,0.0288,0.0298,0.0339,0.0638,0.0356,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
8,pg2684Galsw4-8,0.0218,0.3996,0.1737,0.0727,0.0269,0.0282,0.0387,0.0679,0.0359,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274
9,pg2684Galsw4-12,0.0073,0.5543,0.1653,0.0744,0.0302,0.0323,0.045,0.056,0.0407,...,0.056443,0.095811,0.165089,0.121361,0.092195,0.193271,Five Tales,pg2684,Five Tales,0.459274


In [283]:
# Persist the merged feature table without writing the row index.
redf_f.to_csv(
    path_or_buf="/Users/surajshashidhar/git/fiction/All_features_extracted.csv",
    index=False,
)

In [3]:
# Load the chunk-level feature table; the first CSV row supplies the column names.
feature_df = pd.read_csv(
    filepath_or_buffer="/Users/surajshashidhar/git/fiction/Features_Extracted_milestone3_withf3.csv",
    header=0,
)
# Split "bookId-chunkNo" at the first "-" only, yielding a two-column frame
# (column 0: text before the dash, column 1: text after it).
tmp_df = feature_df["bookId-chunkNo"].str.split(pat="-", n=1, expand=True)
feature_df.head(n=5)

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35
0,pg1257-20,0.0288,0.2705,0.065,0.0697,0.0242,0.0236,0.0273,0.0768,0.0368,...,0.000913,-0.112385,-0.00718,-0.183238,0.011509,-0.164683,0.002138,-0.148766,-0.00738,0.6226
1,pg1257-22,0.0264,0.2225,0.1531,0.066,0.0248,0.026,0.0306,0.0803,0.0384,...,0.000913,-0.112385,-0.00718,-0.183238,0.011509,-0.164683,0.002138,-0.148766,-0.00738,0.6226
2,pg1257-21,0.0371,0.1893,0.0449,0.0628,0.0196,0.0208,0.027,0.0757,0.0464,...,0.000913,-0.112385,-0.00718,-0.183238,0.011509,-0.164683,0.002138,-0.148766,-0.00738,0.6226
3,pg1257-24,0.024,0.153,0.194,0.0842,0.0282,0.0188,0.0267,0.0802,0.0353,...,0.000913,-0.112385,-0.00718,-0.183238,0.011509,-0.164683,0.002138,-0.148766,-0.00738,0.6226
4,pg1257-23,0.0189,0.2073,0.2255,0.0707,0.0335,0.0242,0.0302,0.0748,0.0367,...,0.000913,-0.112385,-0.00718,-0.183238,0.011509,-0.164683,0.002138,-0.148766,-0.00738,0.6226


In [4]:
# Inspect the two parts produced by splitting "bookId-chunkNo".
tmp_df.head(n=5)

Unnamed: 0,0,1
0,pg1257,20
1,pg1257,22
2,pg1257,21
3,pg1257,24
4,pg1257,23


In [5]:
# tmp_df holds the two halves of "bookId-chunkNo": column 0 is the part before
# the first "-" (the book id) and column 1 is the part after it (the chunk number).
feature_df["book_id"] = tmp_df[0]
# Fixed: chunk_id previously copied column 0, which duplicated book_id.
feature_df["chunk_id"] = tmp_df[1]

In [6]:
# Load the per-book start/end emotion vectors; the first CSV row supplies the column names.
f3_df = pd.read_csv(
    filepath_or_buffer="/Users/surajshashidhar/git/fiction/feature3_vectors_milestone3_english.csv",
    header=0,
)
f3_df.head(n=5)

Unnamed: 0,start_anger,start_anticipation,start_disgust,start_fear,start_joy,start_sadness,start_surprise,start_trust,end_anger,end_anticipation,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust,bname,bid
0,0.94298,1.563349,0.397752,1.015185,1.400774,1.063435,1.000725,2.615801,0.828533,1.522968,0.503072,1.132964,1.171348,1.356811,0.943404,2.5409,Roland Cashel- Volume I (of II),pg33468
1,0.667114,1.913909,0.802892,0.876792,1.432745,1.015008,0.964494,2.327047,0.763426,1.55232,0.537831,1.123303,1.54708,0.81474,0.742644,2.918656,Dorothy and the Wizard in Oz,pg420
2,0.933105,1.558126,0.556193,1.304707,1.28008,1.25946,1.155841,1.952488,0.886174,1.998175,0.438925,1.130522,1.379062,1.264448,1.218642,1.684051,The Gambler: A Novel,pg33490
3,0.580084,1.592614,0.412744,1.071454,0.848571,1.165474,0.697628,3.631432,0.997848,1.882936,0.667973,1.65684,0.716712,1.209509,0.797074,2.071106,The Hound of the Baskervilles,pg2852
4,0.840717,1.586577,0.523278,1.0069,1.599138,1.144708,0.972399,2.326284,0.756574,1.895529,0.411796,1.1867,1.468204,1.23607,0.742184,2.302943,"Sea and Shore - A Sequel to ""Miriam's Memoirs""",pg15117


In [7]:
# Broadcast each book's emotion vector onto all of its chunk rows. The inner
# join keeps only chunks whose "book_id" matches a "bid" in f3_df.
out_df = feature_df.merge(
    right=f3_df,
    how="inner",
    left_on="book_id",
    right_on="bid",
)
out_df.head(n=5)

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,end_anger,end_anticipation,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust,bname,bid
0,pg1257-20,0.0288,0.2705,0.065,0.0697,0.0242,0.0236,0.0273,0.0768,0.0368,...,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886,The Three Musketeers,pg1257
1,pg1257-22,0.0264,0.2225,0.1531,0.066,0.0248,0.026,0.0306,0.0803,0.0384,...,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886,The Three Musketeers,pg1257
2,pg1257-21,0.0371,0.1893,0.0449,0.0628,0.0196,0.0208,0.027,0.0757,0.0464,...,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886,The Three Musketeers,pg1257
3,pg1257-24,0.024,0.153,0.194,0.0842,0.0282,0.0188,0.0267,0.0802,0.0353,...,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886,The Three Musketeers,pg1257
4,pg1257-23,0.0189,0.2073,0.2255,0.0707,0.0335,0.0242,0.0302,0.0748,0.0367,...,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886,The Three Musketeers,pg1257


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
out_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,start_surprise,start_trust,end_anger,end_anticipation,end_disgust,end_fear,end_joy,end_sadness,end_surprise,end_trust
count,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,...,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0,1708.0
mean,0.017506,0.26149,0.139611,0.075179,0.024072,0.025683,0.034894,0.066088,0.039737,0.00129,...,0.841444,2.399689,0.908867,1.659316,0.604684,1.228146,1.331193,1.201075,0.838398,2.228321
std,0.007843,0.118117,0.101802,0.016987,0.005683,0.003601,0.006878,0.013187,0.010234,0.001944,...,0.159292,0.419349,0.194195,0.271955,0.160343,0.273876,0.40801,0.196815,0.163751,0.379962
min,0.0004,0.0157,0.0,0.0163,0.0064,0.0162,0.0156,0.0244,0.0167,0.0,...,0.144444,1.276499,0.098725,0.729962,0.127658,0.227679,0.548925,0.628195,0.314286,0.911197
25%,0.0127,0.17755,0.057775,0.0647,0.0203,0.0232,0.0305,0.0569,0.032575,0.0003,...,0.771685,2.111508,0.76999,1.472375,0.496673,1.025371,1.08604,1.073009,0.735662,1.999272
50%,0.0169,0.2434,0.124,0.0756,0.0243,0.0254,0.0347,0.0656,0.0386,0.0007,...,0.841675,2.39728,0.89878,1.690757,0.58116,1.1867,1.290729,1.183169,0.814451,2.200618
75%,0.0215,0.3274,0.204775,0.086925,0.0278,0.0279,0.0388,0.076,0.0451,0.0015,...,0.951606,2.659149,1.022547,1.848932,0.662858,1.369884,1.500082,1.321785,0.935361,2.517997
max,0.1029,0.7012,0.523,0.1239,0.0452,0.0504,0.0687,0.1098,0.1145,0.0404,...,1.239909,4.824248,1.770176,2.632097,1.277526,2.256677,3.759273,2.355431,1.545988,3.455556


In [13]:
# List the merged frame's column labels, then show how many there are.
print(out_df.columns)
out_df.shape[1]

Index(['bookId-chunkNo', 'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8',
       'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18',
       'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28',
       'F29', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'book_id', 'chunk_id',
       'start_anger', 'start_anticipation', 'start_disgust', 'start_fear',
       'start_joy', 'start_sadness', 'start_surprise', 'start_trust',
       'end_anger', 'end_anticipation', 'end_disgust', 'end_fear', 'end_joy',
       'end_sadness', 'end_surprise', 'end_trust', 'bname', 'bid'],
      dtype='object')


57

['same_book_start_end_similarity', 'start_anger', 'start_anticipation', 'start_disgust', 'start_fear', 'start_joy', 'start_sadness', 'start_surprise', 'start_trust', 'end_anger', 'end_anticipation', 'end_disgust', 'end_fear', 'end_joy', 'end_sadness', 'end_surprise', 'end_trust']

Columns F34–F50 (17 columns) hold feature 3: the 17 values listed above.

In [19]:
# Replacement labels for a 57-column frame, in positional order:
# the chunk key, F0..F35, the two id columns, F36..F51, then bname/bid.
cols = (
    ['bookId-chunkNo']
    + ['F{}'.format(i) for i in range(36)]
    + ['book_id', 'chunk_id']
    + ['F{}'.format(i) for i in range(36, 52)]
    + ['bname', 'bid']
)
print(len(cols))
# Columns to keep: the chunk key plus F0..F50.
# NOTE(review): F51 (the last renamed column) is not listed here, so it is
# excluded wherever required_cols is used for selection - confirm this is intended.
required_cols = ['bookId-chunkNo'] + ['F{}'.format(i) for i in range(51)]

57


In [21]:
# Relabel every column of out_df positionally with the names in `cols`.
# The assignment is purely by position, so `cols` must have exactly one entry
# per existing column, in the same order as the current columns.
out_df.columns = cols

In [22]:
# Write only the columns named in required_cols, without the row index.
out_df.loc[:, required_cols].to_csv(
    path_or_buf="/Users/surajshashidhar/git/fiction/All_milestone3_features.csv",
    index=False,
)

In [3]:
# Reload the combined feature table; the first CSV row supplies the column names.
input_df = pd.read_csv(
    filepath_or_buffer="/Users/surajshashidhar/git/fiction/All_milestone3_features.csv",
    header=0,
)
input_df.head(n=5)

Unnamed: 0,bookId-chunkNo,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,F41,F42,F43,F44,F45,F46,F47,F48,F49,F50
0,pg1257-20,0.0288,0.2705,0.065,0.0697,0.0242,0.0236,0.0273,0.0768,0.0368,...,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
1,pg1257-22,0.0264,0.2225,0.1531,0.066,0.0248,0.026,0.0306,0.0803,0.0384,...,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
2,pg1257-21,0.0371,0.1893,0.0449,0.0628,0.0196,0.0208,0.027,0.0757,0.0464,...,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
3,pg1257-24,0.024,0.153,0.194,0.0842,0.0282,0.0188,0.0267,0.0802,0.0353,...,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
4,pg1257-23,0.0189,0.2073,0.2255,0.0707,0.0335,0.0242,0.0302,0.0748,0.0367,...,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886


In [6]:
# Preview columns F34..F50 (17 columns) for up to the first 30 rows.
input_df[["F{}".format(i) for i in range(34, 51)]].head(n=30)

Unnamed: 0,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,F50
0,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
1,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
2,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
3,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
4,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
5,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
6,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
7,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
8,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886
9,0.6226,0.89352,1.725288,0.443957,1.299732,1.247111,1.134879,0.707087,2.548425,1.012851,1.45325,0.485158,1.576651,1.117743,1.432855,0.680606,2.240886


In [7]:
# Divide columns F34..F50 by 10 and store the result back into the same columns.
# NOTE(review): this cell is not idempotent - every re-run divides the values
# by 10 again.
input_df[["F{}".format(i) for i in range(34, 51)]] = input_df[
    ["F{}".format(i) for i in range(34, 51)]
].div(10.0)

In [8]:
# Re-inspect columns F34..F50 after the division by 10 (up to the first 30 rows).
input_df[["F{}".format(i) for i in range(34, 51)]].head(n=30)

Unnamed: 0,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,F50
0,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
1,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
2,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
3,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
4,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
5,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
6,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
7,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
8,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089
9,0.06226,0.089352,0.172529,0.044396,0.129973,0.124711,0.113488,0.070709,0.254842,0.101285,0.145325,0.048516,0.157665,0.111774,0.143285,0.068061,0.224089


In [9]:
# Display the column labels of input_df to confirm which columns will be written out.
input_df.columns

Index(['bookId-chunkNo', 'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8',
       'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18',
       'F19', 'F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28',
       'F29', 'F30', 'F31', 'F32', 'F33', 'F34', 'F35', 'F36', 'F37', 'F38',
       'F39', 'F40', 'F41', 'F42', 'F43', 'F44', 'F45', 'F46', 'F47', 'F48',
       'F49', 'F50'],
      dtype='object')

In [10]:
# Write the frame back without the row index.
# NOTE(review): this is the same path input_df was read from, so the file is
# overwritten in place with the current (rescaled) values.
input_df.to_csv(
    path_or_buf="/Users/surajshashidhar/git/fiction/All_milestone3_features.csv",
    index=False,
)