# Sandbox

A scratch notebook used as a sandbox for exploratory work.

In [115]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
import scipy as sp

In [83]:
base_path = "./data/stanford_imdb"
def read_df(path=None):
    """Load the IMDB review data from a gzip-compressed CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the gzip-compressed CSV to read. When omitted, the
        default file ``{base_path}/imdb_df.csv.gzip`` is used, so existing
        ``read_df()`` calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file exactly as ``pd.read_csv`` returns it; no columns
        are dropped or renamed here.
    """
    if path is None:
        path = f'{base_path}/imdb_df.csv.gzip'
    # Compression is stated explicitly rather than left to be inferred from
    # the non-standard ".gzip" file suffix.
    return pd.read_csv(path, compression='gzip')


In [84]:
df= read_df()

In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,I went and saw this movie last night after bei...,1
1,1,Actor turned director Bill Paxton follows up h...,1
2,2,As a recreational golfer with some knowledge o...,1
3,3,"I saw this film in a sneak preview, and it is ...",1
4,4,Bill Paxton has taken the true story of the 19...,1


In [86]:
# Remove the 'Unnamed: 0' column (apparently a row index that was saved into the CSV).
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [87]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [88]:
df.groupby('sentiment').describe()

Unnamed: 0_level_0,review,review,review,review
Unnamed: 0_level_1,count,unique,top,freq
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,25000,24698,"When i got this movie free from my job, along ...",3
1,25000,24884,Loved today's show!!! It was a variety and not...,5


In [89]:
# Transformer imported from transformers/LengthTransformer (presumably a
# project-local module; its source is not shown in this notebook).
# NOTE(review): judging by the review_length values displayed further down, it
# appears to produce one length per review — confirm against its source.
from transformers.LengthTransformer import TextLengthExtractor
length_tran = TextLengthExtractor()

In [90]:
# Store the extractor's output for the whole frame as a new 'review_length' column.
# This adds the column to df in place, so previews of df shown earlier in the
# notebook do not include it.
df['review_length'] = length_tran.transform(df)

In [91]:
df.head()

Unnamed: 0,review,sentiment,review_length
0,I went and saw this movie last night after bei...,1,794
1,Actor turned director Bill Paxton follows up h...,1,2004
2,As a recreational golfer with some knowledge o...,1,1434
3,"I saw this film in a sneak preview, and it is ...",1,682
4,Bill Paxton has taken the true story of the 19...,1,1216


In [92]:
# Second transformer from transformers/LengthTransformer (presumably a
# project-local module; its source is not shown in this notebook).
# NOTE(review): the avg_word_len values displayed further down suggest a mean
# word length per review — confirm against its source.
from transformers.LengthTransformer import AverageWordLengthExtractor
avg_word_len_tran = AverageWordLengthExtractor()

In [93]:
# Store the extractor's output for the whole frame as a new 'avg_word_len' column.
# This adds the column to df in place, so previews of df shown earlier in the
# notebook do not include it.
df['avg_word_len'] = avg_word_len_tran.transform(df)

In [94]:
df.head()

Unnamed: 0,review,sentiment,review_length,avg_word_len
0,I went and saw this movie last night after bei...,1,794,4.230263
1,Actor turned director Bill Paxton follows up h...,1,2004,4.828488
2,As a recreational golfer with some knowledge o...,1,1434,4.881148
3,"I saw this film in a sneak preview, and it is ...",1,682,4.508065
4,Bill Paxton has taken the true story of the 19...,1,1216,4.936585


In [95]:
# The label column becomes the target; every other column is kept as a feature.
y = df['sentiment']
X = df.drop(columns=['sentiment'])

In [112]:
X.head()

Unnamed: 0,review,review_length,avg_word_len
0,I went and saw this movie last night after bei...,794,4.230263
1,Actor turned director Bill Paxton follows up h...,2004,4.828488
2,As a recreational golfer with some knowledge o...,1434,4.881148
3,"I saw this film in a sneak preview, and it is ...",682,4.508065
4,Bill Paxton has taken the true story of the 19...,1216,4.936585


In [96]:
# Carve off a stratified 30% holdout set that is only used after all training
# and testing is done; the remaining 70% will be further split into train/test.
X_full, X_holdout, y_full, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=222,
    shuffle=True,
    stratify=y,
)


In [97]:
# Second-level split: 30% of the non-holdout data becomes the test set,
# stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.3,
    random_state=222,
    shuffle=True,
    stratify=y_full,
)

In [98]:
X.shape

(50000, 3)

In [99]:
y.shape

(50000,)

In [100]:
X_train.shape

(24500, 3)

In [101]:
y_train.shape

(24500,)

In [102]:
X_holdout.shape

(15000, 3)

In [103]:
y_holdout.shape

(15000,)

In [104]:
# TF-IDF over 1- to 4-grams with English stop words removed; terms seen in
# fewer than 2 documents or in more than 80% of documents are discarded.
tfidf = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=2,
    max_df=0.8,
    stop_words='english',
)


In [105]:
# Fit the vectorizer on the training reviews and turn those same reviews into
# the document-term matrix in a single pass.
X_train_dtm = tfidf.fit_transform(raw_documents=X_train['review'])

In [106]:
X_train_dtm.shape

(24500, 398621)

In [107]:
type(X_train_dtm)

scipy.sparse.csr.csr_matrix

In [108]:
X_train.head()

Unnamed: 0,review,review_length,avg_word_len
44879,"""Carriers"" follows the exploits of two guys an...",4687,4.904282
4737,"It must be said that the director of The Cell,...",2381,4.881481
34193,I just saw this film in Santa Barbara. My frie...,706,3.944056
44163,I thought the movie (especially the plot) need...,3457,4.668852
36639,I'd read about FLAVIA THE HERETIC for many yea...,3509,4.744681


In [109]:
# Kept disabled: .toarray() would turn the 24500 x 398621 document-term matrix
# into a dense array (roughly 78 GB if the values are float64 — TODO confirm
# dtype), which is exactly what working with a sparse matrix is meant to avoid.
#X_train['review_vect'] = list(X_train_dtm.toarray())

In [80]:
X_train.head()

Unnamed: 0,review,review_length,avg_word_len,review_vect
44879,"""Carriers"" follows the exploits of two guys an...",4687,4.904282,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4737,"It must be said that the director of The Cell,...",2381,4.881481,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
34193,I just saw this film in Santa Barbara. My frie...,706,3.944056,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
44163,I thought the movie (especially the plot) need...,3457,4.668852,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
36639,I'd read about FLAVIA THE HERETIC for many yea...,3509,4.744681,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [110]:
# Three stratified 80/20 shuffle splits. The seed is fixed (same value as the
# train_test_split calls) so that every cross_val_score call that uses this
# splitter is scored on identical folds; without it each call draws different
# random folds, so scores are neither reproducible nor directly comparable.
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=222)



In [111]:

# Cross-validated accuracy of a logistic regression (C=100) trained on the
# TF-IDF document-term matrix alone, using the splits produced by `cv`.
# NOTE(review): `tfidf` was fitted on all of X_train before these splits are
# drawn, so the validation folds are not fully unseen by the vectorizer —
# consider wrapping vectorizer and classifier in a Pipeline.
scores = cross_val_score(
    estimator=LogisticRegression(C=100),
    X=X_train_dtm,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)
print(scores, scores.mean())




[0.89346939 0.89285714 0.90183673] 0.8960544217687075


## Combine the Document Term Matrix with the Manually created Features

This section shows how to combine the sparse document-term matrix with a DataFrame of manually created features.

This is not the best way to do it; the next section of the notebook covers using a `FeatureUnion` instead.

However, this technique can still be useful.

The idea is to keep working with sparse matrices throughout and NOT convert a sparse matrix into a dense matrix.


In [113]:
# Keep only the two hand-engineered numeric columns of the training split.
X_manual_features = X_train[['review_length', 'avg_word_len']]

In [114]:
X_manual_features.head()

Unnamed: 0,review_length,avg_word_len
44879,4687,4.904282
4737,2381,4.881481
34193,706,3.944056
44163,3457,4.668852
36639,3509,4.744681


Create a sparse matrix from the manual-features DataFrame.

In [116]:
# Wrap the two manual feature columns in a CSR matrix so they can be stacked
# next to the TF-IDF matrix without densifying it.
# NOTE(review): review_length is in the hundreds to thousands (see the preview
# above), presumably far larger than the TF-IDF weights it will sit beside —
# consider scaling these columns before stacking; confirm the effect on the model.
X_manual_features_sparse = sp.sparse.csr_matrix(X_manual_features)

In [117]:
type(X_manual_features_sparse)

scipy.sparse.csr.csr_matrix

In [118]:
# Combine the two sparse matrices: append the manual feature columns to the
# right of the TF-IDF document-term matrix.
# hstack returns a COO matrix unless a format is given, and COO does not
# support row indexing, so CSR is requested explicitly to keep the result
# sliceable and in the same format as X_train_dtm.
X_manual_features_dtm = sp.sparse.hstack([X_train_dtm, X_manual_features_sparse], format='csr')
X_manual_features_dtm.shape

(24500, 398623)

In [119]:
# Cross-validated accuracy of the same logistic regression (C=100), this time
# on the TF-IDF matrix with the manual feature columns appended.
scores = cross_val_score(
    estimator=LogisticRegression(C=100),
    X=X_manual_features_dtm,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)
print(scores, scores.mean())





[0.8944898  0.89061224 0.90020408] 0.8951020408163265
