# Imports

In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings
warnings.filterwarnings("ignore")

nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
# Utility function for standard text cleaning
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash --. Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

# Assignment
Train your own word2vec representations, as you did in the first example in this checkpoint. However, you need to experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [3]:
# Load and clean the data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [4]:
# Parse the cleaned novels. This can take some time.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [5]:
# Group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = ["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [6]:
# Get rid of stop words and punctuation,
# and lemmatize the tokens
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]

# Vectorization

In [16]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,
    window=6,
    sg=0,
    sample=1e-3,
    size=100,
    hs=1
)

In [17]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('hour', 0.9991674423217773), ('come', 0.998889684677124), ('sea', 0.9988483190536499), ('garden', 0.9988287687301636), ('Cottage', 0.9987945556640625)]
aunt
0.9983593
0.9950253


In [20]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,
    window=8,
    sg=0,
    sample=1e-3,
    size=100,
    hs=1
)

In [21]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('party', 0.9991901516914368), ('stand', 0.9991484880447388), ('early', 0.9989441633224487), ('horse', 0.9988605976104736), ('interesting', 0.9987533092498779)]
aunt
0.9954394
0.98536915


In [22]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,
    window=6,
    sg=0,
    sample=1e-2,
    size=100,
    hs=1
)

In [23]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('probably', 0.995675265789032), ('take', 0.9947940707206726), ('style', 0.9947859048843384), ('sneeze', 0.993588387966156), ('party', 0.9935703277587891)]
uncle
0.99427783
0.930716


In [24]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,
    window=6,
    sg=0,
    sample=1e-4,
    size=100,
    hs=1
)

In [25]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('room', 0.9975862503051758), ('old', 0.9975762367248535), ('find', 0.9974665641784668), ('house', 0.9974289536476135), ('consequence', 0.9974139928817749)]
uncle
0.9982257
0.99333894


In [38]:
# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=1,
    window=5,
    sg=0,
    sample=1e-3,
    size=100,
    hs=1
)

In [39]:
print(model.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(model.doesnt_match("dad dinner mom aunt uncle".split()))
print(model.similarity('woman', 'man'))
print(model.similarity('horse', 'cat'))

[('large', 0.999431312084198), ('wife', 0.9989168643951416), ('spirit', 0.9987666606903076), ('warm', 0.9987398982048035), ('comfort', 0.9986565709114075)]
aunt
0.99716294
0.9954219


In [40]:
word2vec_arr = np.zeros((sentences.shape[0],100))

for i, sentence in enumerate(sentences["text"]):
    word2vec_arr[i,:] = np.mean([model[lemma] for lemma in sentence], axis=0)

word2vec_arr = pd.DataFrame(word2vec_arr)
sentences = pd.concat([sentences[["author", "text"]],word2vec_arr], axis=1)
sentences.dropna(inplace=True)

sentences.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,Carroll,"[Alice, begin, tired, sit, sister, bank, have,...",0.048,-0.187704,0.347052,0.35255,-0.101329,-0.119254,-0.260452,0.366433,0.262457,0.597434,-0.048809,0.025462,-0.262537,0.414285,-0.016715,0.266028,0.130056,-0.09148,0.087965,-0.1481,-0.028312,0.002004,-0.197226,0.169833,0.237772,0.084147,-0.330879,-0.018147,-0.046863,0.180527,-0.012896,-0.050256,0.0043,-0.159269,0.237287,0.092049,-0.205038,-0.093129,...,-0.257731,0.171465,-0.302378,-0.324319,-0.199457,-0.120065,-0.028214,-0.052656,0.157933,0.520118,0.116002,0.046345,-0.012452,-0.032985,-0.047465,-0.020088,0.004033,0.270443,-0.377954,0.369348,-0.066029,0.06312,-0.414351,0.42077,-0.372577,-0.217826,0.296333,-0.035934,0.018498,0.005265,-0.028593,-0.032186,0.217536,0.100305,0.234729,-0.098482,-0.264533,0.07237,0.297547,-0.019668
1,Carroll,"[consider, mind, hot, day, feel, sleepy, stupi...",0.028637,-0.143616,0.278461,0.277434,-0.077205,-0.094961,-0.203207,0.295077,0.206409,0.47606,-0.042261,0.017701,-0.208692,0.334784,-0.003024,0.21078,0.105593,-0.072175,0.070914,-0.118777,-0.032873,-0.001665,-0.165321,0.135812,0.198908,0.067158,-0.269467,-0.01726,-0.035316,0.136428,-0.004193,-0.034832,0.003719,-0.123044,0.186484,0.076487,-0.16387,-0.07249,...,-0.201265,0.132001,-0.241908,-0.258963,-0.157889,-0.096313,-0.023784,-0.042627,0.129051,0.415437,0.092375,0.029581,-0.014867,-0.028881,-0.042388,-0.009877,0.007138,0.215218,-0.304339,0.284749,-0.046607,0.047721,-0.320104,0.324702,-0.293449,-0.172731,0.24422,-0.040395,0.011228,0.004389,-0.020348,-0.024065,0.176421,0.0754,0.185248,-0.072021,-0.220747,0.057374,0.240672,-0.022118
2,Carroll,"[remarkable, Alice, think, way, hear, Rabbit, ...",0.078226,-0.231144,0.424738,0.433419,-0.135063,-0.150943,-0.315627,0.444615,0.319292,0.713312,-0.050013,0.037581,-0.324206,0.491446,-0.023936,0.312761,0.15509,-0.117786,0.102776,-0.17902,-0.018815,0.009269,-0.236493,0.201043,0.279777,0.098908,-0.387835,-0.019561,-0.05454,0.231978,-0.018572,-0.066042,-0.001379,-0.194701,0.29098,0.088063,-0.24162,-0.10982,...,-0.322329,0.209217,-0.363493,-0.393473,-0.242276,-0.14203,-0.021504,-0.06174,0.188295,0.62335,0.140418,0.063806,-0.007322,-0.039991,-0.055212,-0.035048,-0.004581,0.331123,-0.459101,0.455266,-0.081682,0.079585,-0.511063,0.5237,-0.461552,-0.26742,0.344113,-0.031608,0.013197,0.011332,-0.039577,-0.043046,0.253098,0.117555,0.279501,-0.123866,-0.309077,0.090107,0.354641,-0.013576
3,Carroll,"[oh, dear]",0.067305,-0.172221,0.356054,0.341043,-0.101557,-0.132718,-0.22357,0.349203,0.244916,0.528414,-0.036462,0.024717,-0.267669,0.374389,0.007332,0.23881,0.118314,-0.097823,0.075837,-0.157954,-0.031368,0.021877,-0.189552,0.153848,0.248424,0.086758,-0.301894,-0.007183,-0.033442,0.185841,-0.004225,-0.04244,-0.009377,-0.154178,0.22029,0.056979,-0.187647,-0.08004,...,-0.25784,0.157914,-0.272796,-0.306883,-0.181569,-0.099395,-0.009606,-0.036041,0.137915,0.495135,0.108748,0.037299,0.001662,-0.036174,-0.04244,-0.032943,-0.000778,0.26355,-0.365208,0.344609,-0.045369,0.053893,-0.382657,0.415819,-0.371559,-0.199985,0.268662,-0.009539,-0.00168,0.013214,-0.036683,-0.021286,0.176651,0.073824,0.21818,-0.092009,-0.255284,0.078553,0.273098,-0.01246
4,Carroll,"[shall, late]",0.036554,-0.128396,0.243184,0.249459,-0.071475,-0.083279,-0.182885,0.260272,0.177184,0.413054,-0.038989,0.01048,-0.182981,0.290358,-0.01085,0.181951,0.09665,-0.061323,0.054553,-0.106745,-0.020145,0.004751,-0.141043,0.118108,0.170017,0.062201,-0.235599,-0.015271,-0.029049,0.124873,-0.005625,-0.039979,0.000178,-0.108175,0.164856,0.059768,-0.139928,-0.062678,...,-0.183925,0.118869,-0.210392,-0.22483,-0.13841,-0.085344,-0.014477,-0.037121,0.1177,0.352185,0.08144,0.024811,-0.011285,-0.026287,-0.030095,-0.014947,0.000544,0.190926,-0.262653,0.259266,-0.035862,0.042064,-0.287016,0.291218,-0.259948,-0.152244,0.204379,-0.030089,0.003313,0.009476,-0.017117,-0.022798,0.145846,0.068606,0.159695,-0.06107,-0.189,0.044729,0.207115,-0.008538


# Modeling

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.7197231833910035

Test set score: 0.7101167315175098
----------------------Random Forest Scores----------------------
Training set score: 0.9913494809688581

Test set score: 0.72568093385214
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8784602076124568

Test set score: 0.7237354085603113


In [43]:
# Load Google's pretrained word2vec model.
model_pretrained = gensim.models.KeyedVectors.load_word2vec_format(
    'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [44]:
word2vec_arr = np.zeros((sentences.shape[0],300))

for i, sentence in enumerate(sentences["text"]):
  try:
    word2vec_arr[i,:] = np.mean([model_pretrained[lemma] for lemma in sentence], axis=0)
  except KeyError:
    word2vec_arr[i,:] = np.full((1,300), np.nan)
    continue

word2vec_arr = pd.DataFrame(word2vec_arr)
sentences = pd.concat([sentences[["author", "text"]],word2vec_arr], axis=1)
sentences.dropna(inplace=True)

print("Shape of the dataset: {}".format(sentences.shape))
sentences.head()

Shape of the dataset: (2883, 302)


Unnamed: 0,author,text,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,Carroll,"[Alice, begin, tired, sit, sister, bank, have,...",0.046265,0.016199,-0.036288,0.08241,-0.010284,0.015515,0.005437,-0.035947,0.067871,0.040186,0.002303,-0.071809,-0.002277,0.035602,-0.087659,0.067581,0.083479,0.109125,0.038206,-0.112296,0.021118,0.067197,0.000285,-0.046423,0.030869,0.000274,-0.084494,0.078152,0.047164,-0.023407,-0.105367,-0.03999,-0.110767,-0.065475,0.023956,0.010934,0.094955,0.015027,...,-0.055796,0.055115,-0.117415,-0.030527,-0.015355,0.163165,-0.034854,0.015172,-0.106117,0.035062,0.086723,0.159433,0.103741,0.062915,0.097021,-0.047644,-0.026648,-0.071558,0.01808,-0.039431,0.121521,-0.125867,0.006816,0.029865,0.046413,0.018112,-0.087307,0.042181,-0.015435,0.128412,-0.066516,0.029852,-0.042609,-0.044208,-0.056998,-0.063269,0.000244,-0.085071,-0.00034,-0.064371
1,Carroll,"[consider, mind, hot, day, feel, sleepy, stupi...",0.046331,0.020463,-0.002012,0.101565,-0.066478,-0.035698,0.045293,-0.068695,0.04405,0.079996,0.010562,-0.09824,-0.024309,0.042576,-0.078658,0.026042,-0.025208,0.128391,0.054481,-0.081564,-0.022604,0.060187,0.014813,-0.00264,0.089216,0.010905,-0.080477,0.078742,0.071459,-0.042953,-0.011639,0.026516,-0.042924,-0.028997,-0.010134,-0.033885,0.051852,0.018926,...,0.037855,0.004276,-0.073813,0.033909,0.053077,0.063299,-0.044852,-0.004278,-0.053132,-0.035156,0.04793,0.12634,0.125036,0.04657,0.049766,-0.076279,-0.069141,-0.122912,-0.052948,0.055787,0.081729,0.011096,0.005422,0.050716,-0.050148,-0.008294,-0.072707,-0.002824,0.021307,0.035784,0.05594,0.085838,-0.067052,-0.013628,-0.027802,-0.033665,-0.023586,0.00962,0.030316,0.000908
2,Carroll,"[remarkable, Alice, think, way, hear, Rabbit, ...",0.061646,-0.006958,-0.013023,0.147003,-0.052933,-0.077866,0.033997,-0.06189,0.104706,0.151611,-0.083191,-0.102318,-0.043243,-0.060654,-0.060211,0.105164,0.127869,0.207825,-0.009186,0.009155,0.005402,0.077332,0.129974,-0.026632,0.149017,0.04354,-0.082504,0.020443,0.117149,-0.014988,-0.064789,-0.023331,-0.06897,0.002205,0.015739,0.018581,0.110168,0.057068,...,-0.073837,-0.021027,0.002594,0.025757,-0.004457,0.067825,-0.060242,-0.063232,-0.079094,0.098316,0.021147,0.124046,0.078278,0.056248,0.099792,-0.106703,0.034882,-0.111328,-0.009624,-0.011642,0.088547,-0.059265,-0.041046,0.069794,-0.002939,0.018978,-0.025116,-0.057938,0.007706,0.120476,-0.006882,0.030754,-0.073837,-0.010359,-0.086411,-0.156464,-0.000771,-0.000549,-0.003784,0.029114
3,Carroll,"[oh, dear]",0.073975,0.134277,0.141357,0.256348,-0.147949,0.09967,0.077148,-0.093628,0.108887,0.281738,-0.201172,-0.020752,-0.266602,0.000732,-0.036865,0.294434,0.158203,0.287109,-0.114624,0.03833,0.141357,-0.046021,0.407227,0.047852,0.322266,0.213379,-0.090576,0.022812,0.171265,-0.283203,0.193848,0.092285,-0.122803,0.02977,-0.116943,0.026123,0.137451,0.055298,...,-0.014648,0.112793,0.071716,-0.133911,-0.091553,-0.079041,-0.15625,-0.029053,-0.024719,0.102844,-0.084473,0.163086,-0.031738,-0.084473,0.14917,-0.082031,-0.023438,-0.199219,-0.253418,0.206055,0.160156,-0.05603,-0.138184,0.208496,0.030762,0.033447,-0.06189,-0.022461,-0.14624,-0.032959,0.058228,0.000854,-0.094971,-0.052668,-0.091919,-0.142456,-0.053711,-0.112671,-0.148193,0.186798
4,Carroll,"[shall, late]",0.095215,0.084473,0.206787,0.211182,0.043579,-0.155762,0.088379,-0.038574,0.065613,0.001221,-0.144287,0.001465,-0.000771,0.189453,-0.05835,-0.062134,0.045898,0.130127,0.211426,0.074341,-0.056122,-0.111145,0.104355,0.069946,0.191895,0.057404,-0.003906,0.107666,-0.040039,0.082275,-0.046707,-0.150635,-0.006226,0.04895,-0.088745,0.088501,-0.081573,-0.180542,...,0.133301,0.074219,0.049438,0.092743,0.077618,0.084229,-0.100586,-0.022217,0.043579,-0.029785,0.212158,0.073242,0.10022,0.062256,0.16748,0.010693,-0.139923,-0.013805,-0.127014,0.001465,-0.120972,0.06308,-0.024597,0.027847,0.010254,-0.073547,0.100098,0.023438,0.107178,0.065918,-0.021667,-0.103516,-0.038578,-0.007385,0.020264,0.134155,-0.177246,-0.254639,-0.212158,0.087646


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.8617698091382302

Test set score: 0.8110918544194108
----------------------Random Forest Scores----------------------
Training set score: 0.9930595720069404

Test set score: 0.7686308492201039
----------------------Gradient Boosting Scores----------------------
Training set score: 0.9710815500289185

Test set score: 0.7651646447140381
