# Pipeline Example

In [1]:
#=================================================================
# Use Case
#================================================================= 

import pipeline.pipeline as p
import pandas as pd

data = pd.read_csv("Dataset_v2/cleaned/Mark/TheAdventuresOfTomSawyer.csv")

# Run data through pipeline to get metrics applicable for all datasets.
data = p.standardPipeline(data)

# For reading metrics, we require a sentence count column. Add the column
# yourself, depending on your needs, and add it as a second parameter.

data["sentenceCount"] = 2 # My sophisticated count estimation method
data = p.readabilityPipeline(data, "sentenceCount")

# This function will drop columns that are artifacts of the creation of other
# columns.

#data = p.dropIntermediateCols(data)
data.head()

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/kyleeschen/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kyleeschen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,LineNum,ParagraphNum,ChapterNum,DoubleQuoted,Text,words,avgWordLength,sentenceLengthByChar,specicalCharacterCount,avgSyllablesPerWord,...,PROPN,PUNCT,SCONJ,SPACE,VERB,sentenceCount,gunningFoxIndex,daleChallReadability,fleschReadingEase,fleschCincadeGradeLevel
0,1,1,1,1,TOM!,[TOM!],4.0,4,1,1.0,...,1.0,1.0,0.0,0.0,0.0,2,0.2,19.4513,121.7275,-3.595
1,2,2,1,0,No answer.,"[No, answer.]",4.5,10,1,1.5,...,0.0,1.0,0.0,0.0,0.0,2,0.4,19.4761,78.92,2.5
2,3,3,1,1,TOM!,[TOM!],4.0,4,1,1.0,...,1.0,1.0,0.0,0.0,0.0,2,0.2,19.4513,121.7275,-3.595
3,4,4,1,0,No answer.,"[No, answer.]",4.5,10,1,1.5,...,0.0,1.0,0.0,0.0,0.0,2,0.4,19.4761,78.92,2.5
4,5,5,1,1,"What’s gone with that boy, I wonder?","[What’s, gone, with, that, boy,, I, wonder?]",4.4,37,2,1.142857,...,0.0,2.0,0.0,1.0,3.0,2,1.4,10.577243,106.596786,-0.739286


In [None]:
#=================================================================
# Adding On Spacy Tokens
#================================================================= 

import pipeline.tokens as t 

data = t.tokenPipeline(data)
data.head()

# Conversion to JSON

In [26]:
# Generate some user data, then run it through the pipeline

myVersion = '''
I went to the beach and stretched myself out.

A ferryboat trudged across the river, devoid of any other lights.

The moon rose and I become aware of the 

I like to eat eggs and toast.

And fish - fish is very tasty.

Do I like eggs or toast or fish more? It's difficult to say. But I think I like fish more.

'''

user = pd.DataFrame([str(s) for s in nlp(myVersion).sents], columns = ["Text"])
user["Author"] = "User"
user = standardPipeline(user, "Text")

In [7]:
#==================================================================================
# Run code for one book for each author
#==================================================================================

def filepath_to_dataframe(fp, author):
    data = pd.read_csv("Dataset_v2/cleaned/" + fp)
    data = standardPipeline(data)
    data = dropIntermediateCols(data)
    data["Author"] = author
    return data

twain = filepath_to_dataframe("Mark/TheAdventuresOfTomSawyer.csv", "Twain")
austen = filepath_to_dataframe("Jane/Emma_Jane_Austen.csv", "Austen")
fitzgerald = filepath_to_dataframe("Fitzgerald/The_Great_Gatsby.csv", "Fitzgerald")
dickens = filepath_to_dataframe("CharlesDickens/ATaleOfTwoCities.csv", "Dickens")

In [31]:
#==================================================================================
# Concat all dataframes 
#==================================================================================
# (I'll write smoother code for this once we decide what filters we want to use)
data = pd.concat([user, austen, fitzgerald, twain, dickens], axis = 0).reset_index()
data = data[["Author", "sentenceLengthByChar", "shannonEntropy"]]
data.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


Unnamed: 0,Author,sentenceLengthByChar,shannonEntropy
0,User,48,2.197225
1,User,67,2.397895
2,User,42,2.197225
3,User,31,1.94591
4,User,32,1.747868


In [32]:
#==================================================================================
# Convert to Longform
#==================================================================================
from pipeline import metrics as m

data = m.input_to_stats(data, m.METRICS)
data.head()

Unnamed: 0,Author,Metric,Statistic,Value
0,Austen,sentenceLengthByChar,mean,116.158099
1,Austen,shannonEntropy,mean,2.55523
2,Dickens,sentenceLengthByChar,mean,99.091605
3,Dickens,shannonEntropy,mean,2.374118
4,Fitzgerald,sentenceLengthByChar,mean,79.685872


In [33]:
#==================================================================================
# Save
#==================================================================================

data.to_json("Metrics/comparison.json", orient="records")

In [37]:
temp = pd.concat([fitzgerald, user], axis= 0).reset_index()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [46]:
temp[["Author", "Text","sentenceLengthByChar", "shannonEntropy", "Probability"]] \
    .to_json("front-end/templates/static/data/scatter.json", orient = "records")

In [44]:
temp['Probability'] = np.random.randint(0, 100, temp.shape[0])