In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
import nltk
# Launch the installer to download "gutenberg" and "stop words" corpora.
#nltk.download()

In [3]:
from nltk.corpus import gutenberg, stopwords
print (gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [148]:
persuasion = gutenberg.raw("austen-persuasion.txt")
alice = gutenberg.raw("carroll-alice.txt")
paradise      = gutenberg.raw('milton-paradise.txt')

print("\Raw:\n",alice[0:100])

\Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [149]:
# Remove title
pattern = "[\[].*?[\]]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)
paradise = re.sub(pattern, "", paradise)

print("Title removed:\n", alice[0:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [151]:
persuasion[0:100]

'\n\n\nChapter 1\n\n\nSir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who,\nfor his own amu'

In [154]:
# Remove chpater headings
persuasion = re.sub(r'Chapter \d+',"", persuasion)
alice = re.sub(r"CHAPTER .*","", alice)
paradise = re.sub(r"Book .*", "",paradise)

print("Chapter headings remove:\n", alice[0:100])

Chapter headings remove:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [156]:
# Remove newlines and extrawhitespace by splitting and rejoining
persuasion = " ".join(persuasion.split())
alice = " ".join(alice.split())
paradise = " ".join(paradise.split())
print("Extra whitespace removed:\n",alice[0:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [8]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [157]:
import spacy
nlp = spacy.load("en")

alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)
paradise_doc = nlp(paradise)

In [10]:
# Let's explore the objects we've built.
print("The alice_doc object is a {} object.".format(type(alice_doc)))
print("It is {} tokens long".format(len(alice_doc)))
print("The first three tokens are '{}'".format(alice_doc[:3]))
print("The type of each token is {}".format(type(alice_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34430 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [158]:
# Let's explore the objects we've built.
print("The paradise_doc object is a {} object.".format(type(paradise_doc)))
print("It is {} tokens long".format(len(paradise_doc)))
print("The first three tokens are '{}'".format(paradise_doc[:3]))
print("The type of each token is {}".format(type(paradise_doc[0])))

The paradise_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 96199 tokens long
The first three tokens are 'Of Man's'
The type of each token is <class 'spacy.tokens.token.Token'>


In [11]:
from collections import Counter

In [160]:
def word_frequencies(text, include_stop=True):
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
    return Counter(words)


In [162]:
alice_freq = word_frequencies(alice_doc).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc).most_common(10)
paradise_freq = word_frequencies(paradise_doc).most_common(10)
print("Alice:",alice_freq)
print("Persuasion",persuasion_freq)
print("Paradise:",paradise_freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 534), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1121)]
Paradise: [('and', 2796), ('the', 2503), ('to', 1756), ('of', 1485), ('in', 1083), ('his', 986), ('with', 874), ('all', 604), ('And', 596), ('I', 587)]


In [165]:
# Use optional keyword to remove stop words
alice_freq = word_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc, include_stop=False).most_common(10)
paradise_freq = word_frequencies(paradise_doc,include_stop=False).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)
print("Paradise:",paradise_freq)

Alice: [('I', 534), ('said', 453), ('Alice', 394), ("n't", 215), ("'s", 190), ('little', 124), ('The', 102), ('like', 84), ('went', 83), ('know', 83)]
Persuasion: [('I', 1121), ('Anne', 497), ("'s", 485), ('She', 326), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 255), ('He', 225), ('Wentworth', 217)]
Paradise: [('And', 596), ('I', 587), ('Of', 564), ('To', 470), ('The', 462), ('Heaven', 411), ('thou', 382), ('thy', 339), ('thee', 336), ('With', 284)]


In [172]:
# Utility function to calculate how frequently lemas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
alice_lemma_freq = lemma_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_lemma_freq = lemma_frequencies(persuasion_doc, include_stop=False).most_common(10)
paradise_lemma_freq = lemma_frequencies(paradise_doc, include_stop=False).most_common(10)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)
print("Paradise:", paradise_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
alice_lemma_common =      [pair[0] for pair in alice_lemma_freq]
persuasion_lemma_common = [pair[0] for pair in persuasion_lemma_freq]
paradise_lemma_common =   [pair[0] for pair in paradise_lemma_freq]

print('Unique to Alice:', set(alice_lemma_common) - set(persuasion_lemma_common))
print('Unique to Persuasion:', set(persuasion_lemma_common) - set(alice_lemma_common))
print("Unique to Paradise:", set(paradise_lemma_common) - set(alice_lemma_common))


Alice: [('-PRON-', 758), ('say', 476), ('alice', 396), ('be', 254), ('not', 231), ('go', 133), ('think', 131), ('little', 126), ('the', 109), ('look', 105)]
Persuasion: [('-PRON-', 2241), ('anne', 497), ("'s", 466), ('captain', 303), ('elliot', 295), ('mrs', 291), ('good', 289), ('know', 258), ('think', 256), ('mr', 255)]
Paradise: [('-PRON-', 1335), ('and', 596), ('of', 564), ('to', 470), ('the', 462), ('thou', 432), ('heaven', 421), ('thy', 414), ('thee', 357), ('god', 297)]
Unique to Alice: {'little', 'not', 'go', 'be', 'say', 'look', 'the', 'alice'}
Unique to Persuasion: {'know', 'captain', 'anne', 'elliot', "'s", 'mrs', 'mr', 'good'}
Unique to Paradise: {'to', 'god', 'and', 'thou', 'heaven', 'thee', 'thy', 'of'}


In [93]:
# Initial exploration of sentences
sentences = list(alice_doc.sents)
persuasion_sentences =list(persuasion_doc.sents)
print("Alice in Wonderland has {} sentences.".format(len(sentences)))

example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Alice in Wonderland has 1669 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [94]:
sentences

[Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?',
 So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.,
 There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!,
 Oh dear!,
 I shall be late!',
 (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and looked at 

In [19]:
#sentence metrics
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

print(("There are {} words in this sentence, and {} of them are unique").format(len(example_words),len(unique_words)))

There are 29 words in this sentence, and 25 of them are unique


In [20]:
print(nlp("I need a break")[3].pos_)
print(nlp("I need to break the glass")[3].pos_)

NOUN
VERB


In [21]:
print("\nParts of speech:")
for token in example_sentence[:9]:
    print(token.orth_,token.pos_)


Parts of speech:
There ADV
was VERB
nothing NOUN
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


In [22]:
print("\nParts of speech:")
for token in example_sentence[:9]:
    print(token.pos_)


Parts of speech:
ADV
VERB
NOUN
ADV
ADV
ADJ
ADP
DET
PUNCT


In [24]:
example_sentence[:9]

There was nothing so VERY remarkable in that;

In [25]:
def text_cleaner(text):
    text = re.sub(r"--", " ", text)
    text = re.sub("[\[].*?[\]]", "",text)
    text = " ".join(text.split())
    return text

# Load data
persuasion = gutenberg.raw("austen-persuasion.txt")
alice = gutenberg.raw("carroll-alice.txt")


# Remove chapter headers

persuasion = re.sub(r"Chapter \d+", "", persuasion)
alice = re.sub(r"CHAPTER .*", "", alice)

alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [26]:
# Parse the text

nlp = spacy.load("en")
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [27]:
alice_sentences = sentences

In [177]:
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
paradise_sents = [[sent, "Milton"] for sent in paradise_doc.sents]

sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences_2 = pd.DataFrame(alice_sents + paradise_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [180]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)
paradisewords = bag_of_words(paradise_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)
common_words_2 = set(alicewords + paradisewords)

In [31]:
# Create dataframe with features

word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,leg,bid,throat,toffee,reconcile,acquaintance,lick,show,authorise,describe,...,sight,prison,invitation,roast,stupidly,boast,require,originally,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [181]:
word_counts_2 = bow_features(sentences_2, common_words_2)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000


In [39]:
adv_count =[]
for sent in alice_sentences:
    for token in sent:
        count=0
        if token.pos_ == "ADV":
            count = count+1
        else:
            count=count
    adv_count.append(count)

In [40]:
adv_count

[0]

In [44]:
alice_sents[0:3]

[[Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?',
  'Carroll'],
 [So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.,
  'Carroll'],
 [There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!,
  'Carroll']]

In [47]:
for sent in alice_sents[0:3]:
    print(sent)
    print("step")

[Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?', 'Carroll']
step
[So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her., 'Carroll']
step
[There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!, 'Carroll']
step


In [95]:
combined_sentences=sentences + persuasion_sentences

In [106]:
adv_count =[]
for sent in combined_sentences:
    count=0
    for token in sent:
        if token.pos_ == "ADV":
            count = count+1
        else:
            count=count
    adv_count.append(count)

In [107]:
verb_count =[]
for sent in combined_sentences:
    count=0
    for token in sent:
        if token.pos_ == "VERB":
            count = count+1
        else:
            count=count
    verb_count.append(count)

In [108]:
noun_count =[]
for sent in combined_sentences:
    count=0
    for token in sent:
        if token.pos_ == "NOUN":
            count = count+1
        else:
            count=count
    noun_count.append(count)

In [109]:
punct_count =[]
for sent in combined_sentences:
    count=0
    for token in sent:
        if token.pos_ == "PUNCT":
            count = count+1
        else:
            count=count
    punct_count.append(count)

In [110]:
adj_count =[]
for sent in combined_sentences:
    count=0
    for token in sent:
        if token.pos_ == "ADJ":
            count = count+1
        else:
            count=count
    adj_count.append(count)

In [60]:
print("\nParts of speech:")
for token in example_sentence[:9]:
    print(token.pos_)


Parts of speech:
ADV
VERB
NOUN
ADV
ADV
ADJ
ADP
DET
PUNCT


In [111]:
len(adv_count)

5318

In [114]:
word_counts["adv_count"] = adv_count
word_counts["verb_count"] = verb_count
word_counts["noun_count"] = noun_count
word_counts["punct_count"] = punct_count
word_counts["adj_count"] = adj_count

In [115]:
word_counts["sentence_length"] = word_counts["text_sentence"].apply(lambda x: len(x))
word_counts["unique_length"] = word_counts["text_sentence"].apply(lambda x: len(set([token.text for token in x])))
word_counts["percent_unique"] = word_counts.unique_length / word_counts.sentence_length

In [116]:
word_counts

Unnamed: 0,leg,bid,throat,toffee,reconcile,acquaintance,lick,show,authorise,describe,...,text_sentence,text_source,adv_count,verb_count,noun_count,punct_count,adj_count,sentence_length,unique_length,percent_unique
0,0,0,0,0,0,0,0,0,0,0,...,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,3,13,12,10,3,67,45,0.671642
1,0,0,0,0,0,0,0,0,0,0,...,"(So, she, was, considering, in, her, own, mind...",Carroll,7,11,8,7,7,63,51,0.809524
2,0,0,0,0,0,0,0,0,0,0,...,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,6,5,2,4,1,33,29,0.878788
3,0,0,0,0,0,0,0,0,0,0,...,"(Oh, dear, !)",Carroll,0,0,0,1,0,3,3,1.000000
4,0,0,0,0,0,0,0,0,0,0,...,"(I, shall, be, late, !, ')",Carroll,0,2,0,2,1,6,6,1.000000
5,0,0,0,0,0,0,0,0,0,0,...,"((, when, she, thought, it, over, afterwards, ...",Carroll,10,19,14,17,5,126,81,0.642857
6,0,0,0,0,0,0,0,0,0,0,...,"(In, another, moment, down, went, Alice, after...",Carroll,5,4,2,2,0,23,23,1.000000
7,0,0,0,0,0,0,0,0,0,0,...,"(The, rabbit, -, hole, went, straight, on, lik...",Carroll,11,7,5,4,1,44,38,0.863636
8,0,0,0,0,0,0,0,0,0,0,...,"(Either, the, well, was, very, deep, ,, or, sh...",Carroll,4,9,4,3,1,37,30,0.810811
9,0,0,0,0,0,0,0,0,0,0,...,"(First, ,, she, tried, to, look, down, and, ma...",Carroll,4,11,7,6,1,49,38,0.775510


# Original Models

In [61]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.989968652038

Test set score: 0.886278195489


In [62]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3062) (3190,)
Training set score: 0.957993730408

Test set score: 0.915883458647


In [63]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.886833855799

Test set score: 0.874060150376


In [65]:
from sklearn.svm import SVC

In [69]:
svc_model = SVC()
train = svc_model.fit(X_train, y_train)

print("Train score:", svc_model.score(X_train,y_train))
print("Test score:", svc_model.score(X_test,y_test))
      

Train score: 0.682445141066
Test score: 0.691729323308


In [70]:
svc_model_weighted = SVC(class_weight="balanced")
train = svc_model_weighted.fit(X_train, y_train)

print("Train score:", svc_model_weighted.score(X_train,y_train))
print("Test score:", svc_model_weighted.score(X_test,y_test))

Train score: 0.720689655172
Test score: 0.707236842105


In [120]:
from sklearn.neighbors import KNeighborsClassifier

In [72]:
knn = KNeighborsClassifier(n_neighbors =3)
knn.fit(X_train, y_train)

print("Train score:", knn.score(X_train,y_train))
print("Test score:", knn.score(X_test,y_test))

Train score: 0.869278996865
Test score: 0.754229323308


# Adjusted Models

In [117]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.988401253918

Test set score: 0.869360902256


In [118]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3070) (3190,)
Training set score: 0.963009404389

Test set score: 0.923402255639


In [144]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty="l2",solver="newton-cg")
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3070) (3190,)
Training set score: 0.962695924765

Test set score: 0.923872180451


In [121]:
knn = KNeighborsClassifier(n_neighbors =10)
knn.fit(X_train, y_train)

print("Train score:", knn.score(X_train,y_train))
print("Test score:", knn.score(X_test,y_test))

Train score: 0.777429467085
Test score: 0.742011278195


In [122]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.889968652038

Test set score: 0.869360902256


# Challenge 1

In [147]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [183]:
Y_2 = word_counts_2['text_source']
X_2 = np.array(word_counts_2.drop(['text_sentence','text_source'], 1))

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, Y_2, test_size=0.4, random_state=0)

In [186]:
lr = LogisticRegression(penalty="l2",solver="newton-cg")
train = lr.fit(X_train_2, y_train_2)
print(X_train_2.shape, y_train_2.shape)
print('Training set score:', lr.score(X_train_2, y_train_2))
print('\nTest set score:', lr.score(X_test_2, y_test_2))

(2667, 3266) (2667,)
Training set score: 0.985751781027

Test set score: 0.937570303712
