In [68]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

In [2]:
# Load the cleaned Dallas CSV; the path is relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='zak_data/clean_dallas.csv')

In [35]:
# Model input is the raw 'text' column; the regression target is the 'counts' column.
X, y = df['text'], df['counts']

In [36]:
# Hold out a test split; the fixed seed keeps the partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [28]:
# Peek at the training texts (pandas truncates the display of long objects).
X_train

Unnamed: 0,text
9146,that fucking sack of orange shit has pissed me...
3709,Last goal.....literally the last goal of the y...
4370,Oh Trump definitely has a plan. Eliminate test...
6711,Some are off on covid leave. So they are being...
14475,"The rise in young, black home ownership is dop..."
...,...
10955,@marklevinshow was talking about ur coronaviru...
17289,"""Kamala Harris delivers remarks on COVID-19 pa..."
5192,"If you need masks, gloves, sanitizer etc check..."
12172,First COVID-19 vaccine tested in the US shows ...


In [37]:
# Bag-of-words sanity check on the training texts.
# fit_transform learns the vocabulary and builds the document-term matrix in a
# single pass, so the corpus is not tokenized a second time by a separate
# transform() call.
cvec = CountVectorizer()
X_train_cvec = cvec.fit_transform(X_train)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# exists from 1.0 onwards. Pick whichever the installed version provides.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# NOTE: .toarray() densifies the whole matrix. That is fine for eyeballing the
# vocabulary, but estimators should be given the sparse X_train_cvec instead.
X_train_df = pd.DataFrame(X_train_cvec.toarray(),
                          columns=feature_names
                         )

In [38]:
# Inspect the dense document-term frame: one row per training document,
# one column per vocabulary token learned by the vectorizer.
X_train_df

Unnamed: 0,00,000,0003,00033,000502857142857,000512,0007,001,0027,003_osp,...,ไปท,에이티즈,𝐛𝐞,𝐜𝐥𝐨𝐬𝐞𝐝,𝐟𝐨𝐫,𝐭𝐡𝐞,𝐰𝐞,𝐰𝐢𝐥𝐥,𝔻𝕚𝕞𝕖𝕟𝕤𝕚𝕠𝕟𝕒𝕝,𝕓𝕝𝕠𝕟𝕕𝕖
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# NOTE(review): X_train_trans is not assigned in any cell visible here, so this
# presumably relies on leftover kernel state and would raise NameError after
# Restart & Run All. Confirm where it came from, or drop the cell.
X_train_trans

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [39]:
# Baseline model: token counts -> TF-IDF weights -> ordinary least squares.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LinearRegression()),
])

# Nothing is tuned yet: with an empty grid the search evaluates only the
# pipeline's default configuration under 5-fold cross-validation.
pipe_params = dict()

gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# Score the fitted search on both splits before reporting.
train_r2 = gs.score(X_train, y_train)
test_r2 = gs.score(X_test, y_test)
print(f'Training Score: {train_r2}')
print(f'Testing Score: {test_r2}')

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('lr', LinearRegression())]),
             param_grid={})

In [40]:
# Score of whichever search `gs` currently refers to, evaluated on the training split.
display(gs.score(X_train,y_train))

0.9892661197277993

In [42]:
# Same score on the held-out split. The recorded value is hugely negative while
# the training score is close to 1, which points to severe overfitting
# (presumably of the unregularised linear model fitted just above).
gs.score(X_test, y_test)

-665.5009528573495

In [49]:
# Same counts -> TF-IDF -> least-squares pipeline, now with the vocabulary capped.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LinearRegression())
])

# Every entry is a single-value list, so the search has exactly one candidate.
# The values mirror the pipeline defaults except 'cvec__max_features', which is
# capped at 5000; the rest are spelled out as a template for later tuning.
pipe_params = {
# Vectorizer Options
 'cvec__analyzer': ['word'],
 'cvec__binary': [False],
 'cvec__decode_error': ['strict'],
 'cvec__encoding': ['utf-8'],
 'cvec__input': ['content'],
 'cvec__lowercase': [True],
 'cvec__max_df': [1.0],
 'cvec__max_features': [5000],
 'cvec__min_df': [1],
 'cvec__ngram_range': [(1, 1)],
 'cvec__preprocessor': [None],
 'cvec__stop_words': [None],
 'cvec__strip_accents': [None],
 'cvec__token_pattern': ['(?u)\\b\\w\\w+\\b'],
 'cvec__tokenizer': [None],
 'cvec__vocabulary': [None],
# Transformer Options
 'tfidf__norm': ['l2'],
 'tfidf__smooth_idf': [True],
 'tfidf__sublinear_tf': [False],
 'tfidf__use_idf': [True],
# Linear Regression Options
# 'lr__normalize' is deliberately not listed: LinearRegression lost that
# parameter in scikit-learn 1.2, and naming it in the grid makes gs.fit raise
# "Invalid parameter". Its old default was False, so omitting it leaves the
# fitted model unchanged on older versions.
 'lr__copy_X': [True],
 'lr__fit_intercept': [True],
 'lr__n_jobs': [None]
}

gs = GridSearchCV(pipe,
                   param_grid = pipe_params,
                   cv=5,
                  n_jobs=2,
                  verbose = 2)

gs.fit(X_train,y_train)

print(f'Training Score: {gs.score(X_train,y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    4.3s finished


Training Score: 0.4764089940666961
Testing Score: -0.31565905921133264


In [45]:
# List every parameter of the current pipeline; the step-prefixed names
# ('cvec__...', 'tfidf__...', ...) are the keys a param grid can target.
pipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('lr', LinearRegression())],
 'verbose': False,
 'cvec': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'lr': LinearRegression(),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'lr__copy_X': True,
 'lr__fit_intercept': True,
 'lr__n_jobs': None,
 'lr__normalize': False}

In [75]:
# Small feed-forward regressor trained directly on raw token counts.
mlp = MLPRegressor(
    # --- architecture ---
    hidden_layer_sizes=(3,),
    activation='relu',
    # --- optimiser ---
    solver='adam',
    alpha=0.001,
    batch_size='auto',
    learning_rate_init=0.01,
    max_iter=1000,
    shuffle=True,
    random_state=9,
    tol=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    # --- documented by scikit-learn as used only when solver='sgd' ---
    learning_rate='adaptive',
    power_t=0.5,
    momentum=0.9,
    nesterovs_momentum=True,
    # --- run control ---
    verbose=False,
    warm_start=False,
    early_stopping=False,
    validation_fraction=0.1,
)

pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('MLP', mlp),
])

# 2 n-gram settings x 3 vocabulary caps = 6 candidates, each scored with 5-fold CV.
pipe_params = {
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'cvec__max_features': [1000, 3000, 5000],
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=5,
    n_jobs=2,
    verbose=2,
)
gs.fit(X_train, y_train)

train_r2 = gs.score(X_train, y_train)
test_r2 = gs.score(X_test, y_test)
print(f'Training Score: {train_r2}')
print(f'Testing Score: {test_r2}')
print(f'Best Params: {gs.best_params_}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:  2.1min finished


Training Score: 0.2267655171351115
Testing Score: 0.02317202807429042
Best Params: {'cvec__max_features': 1000, 'cvec__ngram_range': (1, 1)}


In [60]:
# Re-report both scores for whichever search `gs` currently holds.
train_r2 = gs.score(X_train, y_train)
test_r2 = gs.score(X_test, y_test)
print(f'Training Score: {train_r2}')
print(f'Testing Score: {test_r2}')

Training Score: 0.16613476370855107
Testing Score: 0.03894265660739582


In [73]:
# Variant of the MLP pipeline that inserts TF-IDF weighting between the
# vectorizer and the network. This cell only rebinds `pipe`; nothing is fitted here.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('MLP', MLPRegressor(
        hidden_layer_sizes=(3,),
        activation='relu',
        solver='adam',
        alpha=0.001,
        batch_size='auto',
        learning_rate='adaptive',
        learning_rate_init=0.01,
        power_t=0.5,
        max_iter=1000,
        shuffle=True,
        random_state=9,
        tol=0.0001,
        verbose=False,
        warm_start=False,
        momentum=0.9,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-08,
    )),
])

In [61]:
# Best hyper-parameter combination found by the most recently fitted search.
gs.best_params_

{'cvec__max_features': 1000, 'cvec__ngram_range': (1, 1)}

In [52]:
# List the parameters of the current pipeline.
# NOTE(review): the recorded output shows LinearRegression under the 'MLP' step,
# so it was presumably captured from an earlier definition of `pipe`; re-run
# after the cell that builds the MLP pipeline to refresh it.
pipe.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('MLP', LinearRegression())],
 'verbose': False,
 'cvec': CountVectorizer(),
 'tfidf': TfidfTransformer(),
 'MLP': LinearRegression(),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'MLP__copy_X': True,
 'MLP__fit_intercept': True,
 'MLP__n_jobs': None,
 'MLP__normalize': False}

In [76]:
# Random-forest regressor on raw token counts.
# The forest is seeded so that the reported scores and the selected parameters
# are reproducible across re-runs, matching the seeded train/test split.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestRegressor(random_state=1))
])

# 2 n-gram settings x 3 vocabulary caps = 6 candidates, each scored with 5-fold CV.
pipe_params = {
    'cvec__ngram_range': [(1, 1), (1,2)],
    'cvec__max_features': [1000, 3000, 5000],
    
}

gs = GridSearchCV(pipe,
                   param_grid = pipe_params,
                   cv=5, n_jobs=2, verbose = 2)

# NOTE: the recorded run of this search took close to an hour with two workers.
gs.fit(X_train,y_train)
print(f'Training Score: {gs.score(X_train,y_train)}')
print(f'Testing Score: {gs.score(X_test, y_test)}')
print(f'Best Params: {gs.best_params_}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed: 57.8min finished


Training Score: 0.8596347671415592
Testing Score: 0.09173843797652104
Best Params: {'cvec__max_features': 1000, 'cvec__ngram_range': (1, 1)}
