# 2. Natural Language Processing

This notebook preprocesses and develops pipelines to classify posts from the two subreddits using classification models.

**Preprocessing and Modeling**
- Is text data successfully converted to a matrix representation?
- Are methods such as stop words, stemming, and lemmatization explored?
- Does the student properly split and/or sample the data for validation/training purposes?
- Does the student test and evaluate a variety of models to identify a production algorithm (**AT MINIMUM:** two classification models, **BONUS:** try a Naive Bayes)?
- Does the student defend their choice of production model relevant to the data at hand and the problem?
- Does the student explain how the model works and evaluate its performance successes/downfalls?

#### Imports

In [278]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB

In [3]:
%store -r df stop_words

#### Make train and test sets out of the train data

In [4]:
X = df['selftext']
y = df['subreddit']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,stratify=y)

In [6]:
y_test.value_counts()

Divorce            997
weddingplanning    987
Name: subreddit, dtype: int64

In [7]:
print(f'Baseline accuracy is {round(y_test.value_counts()[0]*100/len(y_test),1)}%.')

Baseline accuracy is 50.3%.


#### Encode subreddit

In [8]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

#### Create Lemmatizer

In [9]:
def lemma_tokenizer(doc):  ## Used class notes
    wnl = WordNetLemmatizer()
    return [w for w in word_tokenize(doc)]

### Pipelines

#### Logistic Regression Pipeline

In [22]:
# First pass
pipe_lr = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter = 10_000))
])
pipe_lr.fit(X_train, y_train_enc)
pipe_lr.score(X_train, y_train_enc), pipe_lr.score(X_test, y_test_enc)

(0.939149436880148, 0.9112903225806451)

GridSearch

In [86]:
params = {
    'cv__tokenizer': [None,lemma_tokenizer]
}

In [87]:
gs_lr = GridSearchCV(
    pipe_lr, param_grid=params,n_jobs=-1)

In [88]:
gs_lr.fit(X_train,y_train_enc);

In [89]:
gs_lr.best_params_

{'cv__tokenizer': None}

In [90]:
gs_lr.best_score_

0.9093972054759666

In [91]:
gs_lr.cv_results_

{'mean_fit_time': array([ 6.39970903, 19.65938621]),
 'std_fit_time': array([0.16836814, 1.16669374]),
 'mean_score_time': array([0.98642325, 3.49531579]),
 'std_score_time': array([0.09329038, 0.7842291 ]),
 'param_cv__tokenizer': masked_array(data=[None, <function lemma_tokenizer at 0x7fa0b02d85e0>],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'cv__tokenizer': None},
  {'cv__tokenizer': <function __main__.lemma_tokenizer(doc)>}],
 'split0_test_score': array([0.91092437, 0.89663866]),
 'split1_test_score': array([0.89663866, 0.89663866]),
 'split2_test_score': array([0.91596639, 0.91428571]),
 'split3_test_score': array([0.91008403, 0.90588235]),
 'split4_test_score': array([0.91337258, 0.90748528]),
 'mean_test_score': array([0.90939721, 0.90418613]),
 'std_test_score': array([0.00670138, 0.00677779]),
 'rank_test_score': array([1, 2], dtype=int32)}

I will feed this back into my model.

I will feed the results back into my model. Results:
- best_params = {'cv__min_df': 4, 'cv__ngram_range': (1, 3)} for params = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)],
    'cv__min_df':np.arange(2,9)
}
- best_params = {'lr__C': 0.2} for params = {
    'lr__C': [1/20, 1/10, 1/5, 1/2, 1, 2, 5]
}
- no impact for params = {
    'cv__max_df': [0.4,0.5,0.6,0.7,0.8,0.9,1.0]
}
- best_params = {'cv__strip_accents': 'unicode'} (actually tie with None, but quicker) for params = {
    'cv__strip_accents': [None,'ascii', 'unicode']
}
- best_params = 'cv__tokenizer': None} for params = {
    'cv__tokenizer': [None,lemma_tokenizer]
}

In [92]:
# Best logistic model using gridsearch
pipe_lr = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=4,max_df=1.0,strip_accents='unicode')),
    ('lr', LogisticRegression(C=1/5))
])
pipe_lr.fit(X_train, y_train_enc)
pipe_lr.score(X_train, y_train_enc), pipe_lr.score(X_test, y_test_enc)

(0.9324256177508825, 0.9143145161290323)

#### Decision Tree Pipeline

In [173]:
pipe_dt = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,2),min_df=4)),
    ('dt', DecisionTreeClassifier(max_depth=None,criterion='entropy',min_samples_split=3))
])
pipe_dt.fit(X_train, y_train_enc)
pipe_dt.score(X_train, y_train_enc), pipe_dt.score(X_test, y_test_enc)

(0.9384770549672213, 0.8770161290322581)

In [174]:
pipe_dt.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cv', 'dt', 'cv__analyzer', 'cv__binary', 'cv__decode_error', 'cv__dtype', 'cv__encoding', 'cv__input', 'cv__lowercase', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'cv__preprocessor', 'cv__stop_words', 'cv__strip_accents', 'cv__token_pattern', 'cv__tokenizer', 'cv__vocabulary', 'dt__ccp_alpha', 'dt__class_weight', 'dt__criterion', 'dt__max_depth', 'dt__max_features', 'dt__max_leaf_nodes', 'dt__min_impurity_decrease', 'dt__min_samples_leaf', 'dt__min_samples_split', 'dt__min_weight_fraction_leaf', 'dt__random_state', 'dt__splitter'])

In [175]:
params_dt = {
     'cv__max_df': [0.4,0.5,0.6,0.7,0.8,0.9,1.0] 
}

In [176]:
gs_dt = GridSearchCV(pipe_dt, param_grid = params_dt, n_jobs=-1)

In [177]:
gs_dt.fit(X_train,y_train_enc);

In [178]:
gs_dt.best_params_

{'cv__max_df': 0.5}

In [179]:
gs_dt.best_score_

0.8793062456269304

In [180]:
gs_dt.cv_results_

{'mean_fit_time': array([4.118434  , 4.06739755, 3.98271585, 4.56136899, 4.90900517,
        5.6661056 , 3.59714231]),
 'std_fit_time': array([0.04455259, 0.15831547, 0.08430205, 0.27434803, 0.33933838,
        0.0706631 , 1.65485582]),
 'mean_score_time': array([0.55636244, 0.57893558, 0.64267879, 0.70341001, 0.78846087,
        0.76561017, 0.46437607]),
 'std_score_time': array([0.02544631, 0.06927434, 0.03664722, 0.04626765, 0.06456284,
        0.03674529, 0.18843309]),
 'param_cv__max_df': masked_array(data=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              mask=[False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'cv__max_df': 0.4},
  {'cv__max_df': 0.5},
  {'cv__max_df': 0.6},
  {'cv__max_df': 0.7},
  {'cv__max_df': 0.8},
  {'cv__max_df': 0.9},
  {'cv__max_df': 1.0}],
 'split0_test_score': array([0.87647059, 0.87142857, 0.86806723, 0.8697479 , 0.87310924,
        0.87226891, 0.86890756]),
 'split1_test_score': array([

I will feed the results back into my model. Results:
- best_params = {'dt__criterion': 'entropy'} for params_dt = {
    'dt__criterion': ["gini", "entropy"]
}
- best_params = {'dt__criterion': 'entropy', 'dt__min_samples_split': 4} for params_dt = {
    'dt__criterion': ["gini", "entropy"],
    'dt__min_samples_split': [2,4,6],
}
- best_params = {'dt__criterion': 'entropy', 'dt__min_samples_split': 3} for params_dt = {
    'dt__criterion': ["entropy"],
    'dt__min_samples_split': [3,4,5],
}
- best_params = {'cv__min_df': 4, 'cv__ngram_range': (1, 2)} for params_dt = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)], 'cv__min_df':np.arange(2,9) 
}
- best_params = {'cv__max_df': 0.5} for params_dt = {
     'cv__max_df': [0.4,0.5,0.6,0.7,0.8,0.9,1.0] 
}

In [187]:
# Best Model
pipe_dt = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,2),min_df=4,max_df=0.5)),
    ('dt', DecisionTreeClassifier(max_depth=None,criterion='entropy',min_samples_split=3))
])
pipe_dt.fit(X_train, y_train_enc)
pipe_dt.score(X_train, y_train_enc), pipe_dt.score(X_test, y_test_enc)

(0.9384770549672213, 0.8780241935483871)

#### K Nearest Neighbors Pipeline

In [195]:
pipe_knn = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=4,max_df=1.0,strip_accents='unicode')),
    ('knn', KNeighborsClassifier(n_neighbors=5,p=2))
])
pipe_knn.fit(X_train, y_train_enc)
pipe_knn.score(X_train, y_train_enc), pipe_knn.score(X_test, y_test_enc)

(0.7959320894267944, 0.688508064516129)

GridSearch

In [None]:
KNeighborsClassifier(

In [196]:
params_knn = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)], 'cv__min_df':np.arange(2,9) 
}

In [197]:
gs_knn = GridSearchCV(
    pipe_knn, param_grid=params_knn,n_jobs=-1)

In [198]:
gs_knn.fit(X_train,y_train_enc);

In [199]:
gs_knn.best_params_

{'cv__min_df': 3, 'cv__ngram_range': (1, 1)}

In [200]:
gs_knn.best_score_

0.7123807167947078

In [201]:
gs_knn.cv_results_

{'mean_fit_time': array([ 2.12323856,  4.16683111,  5.89952655,  8.09805994,  3.84587107,
         6.03389306,  8.22604022,  3.02478766,  5.00029564,  7.11949663,
         9.74562941,  4.63361645,  6.71892247, 10.11451302,  3.83520718,
         5.50885997,  7.87878232, 10.2466238 ,  4.77153378,  7.07806482,
         9.97503214,  3.79238582,  5.61771998,  8.09596238, 10.49108057,
         4.83726583,  7.01719527,  9.36247749,  3.49472523,  5.52764211,
         7.94837661, 10.724475  ,  4.94969053,  7.02651815,  9.25191865,
         3.50314326,  5.58030639,  7.74292116, 10.1194838 ,  4.87931461,
         8.25374603, 10.859974  ,  3.70433235,  5.36447678,  7.59529076,
         9.7388134 ,  4.49696131,  6.69355206,  7.1257195 ]),
 'std_fit_time': array([0.05332698, 0.18954436, 0.0491649 , 0.06328984, 0.067586  ,
        0.08388695, 0.11230609, 0.06009406, 0.08230781, 0.08148116,
        0.1919078 , 0.14482546, 0.08509866, 0.08405127, 0.43405699,
        0.1661816 , 0.061999  , 0.12847497, 

I will feed the results back into my model. Results:
- best_params = {'knn__n_neighbors': 5} for params_knn = {'knn__n_neighbors':np.arange(2,15)}
- best_params = {'knn__n_neighbors': 5, 'knn__p': 2} for params_knn = {
    'knn__n_neighbors':np.arange(3,7),
    'knn__p': [1,2]
}
- no impact for params_knn = {
     'cv__max_df': [0.4,0.5,0.6,0.7,0.8,0.9,1.0] 
}
- best_params = {'cv__min_df': 3, 'cv__ngram_range': (1, 1)} for params_knn = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)], 'cv__min_df':np.arange(2,9) 
}

In [202]:
pipe_knn = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,1),min_df=3,max_df=1.0,strip_accents='unicode')),
    ('knn', KNeighborsClassifier(n_neighbors=5,p=2))
])
pipe_knn.fit(X_train, y_train_enc)
pipe_knn.score(X_train, y_train_enc), pipe_knn.score(X_test, y_test_enc)

(0.7656749033451, 0.6854838709677419)

#### Random Forest Pipeline

In [245]:
pipe_rf = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=8,strip_accents = 'unicode')),
    ('rf', RandomForestClassifier(max_depth=None,n_estimators=500))
])
pipe_rf.fit(X_train, y_train_enc)
pipe_rf.score(X_train, y_train_enc), pipe_rf.score(X_test, y_test_enc)

(0.9384770549672213, 0.9087701612903226)

In [238]:
params_rf = {
    'cv__min_df':np.arange(7,11)
}

In [239]:
gs_rf = GridSearchCV(pipe_rf, param_grid = params_rf, n_jobs=-1)

In [240]:
gs_rf.fit(X_train,y_train_enc);

In [241]:
gs_rf.best_params_

{'cv__min_df': 7}

In [242]:
gs_rf.best_score_

0.9072123315263868

In [243]:
gs_rf.cv_results_

{'mean_fit_time': array([51.78116174, 53.25754938, 55.80787764, 35.65900488]),
 'std_fit_time': array([0.76492955, 2.22882995, 0.52410677, 9.77487916]),
 'mean_score_time': array([3.32539935, 3.0655519 , 2.98112411, 1.93132234]),
 'std_score_time': array([0.16342747, 0.22127027, 0.08662534, 0.50520455]),
 'param_cv__min_df': masked_array(data=[7, 8, 9, 10],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'cv__min_df': 7},
  {'cv__min_df': 8},
  {'cv__min_df': 9},
  {'cv__min_df': 10}],
 'split0_test_score': array([0.90756303, 0.90840336, 0.90840336, 0.90420168]),
 'split1_test_score': array([0.89663866, 0.89747899, 0.89663866, 0.89327731]),
 'split2_test_score': array([0.91260504, 0.91260504, 0.91092437, 0.91176471]),
 'split3_test_score': array([0.90588235, 0.90504202, 0.90588235, 0.9092437 ]),
 'split4_test_score': array([0.91337258, 0.91253154, 0.91253154, 0.91505467]),
 'mean_test_score': array([0.90721233, 0.9072121

I will feed the results back into my model. Results:
- best_params = {'cv__min_df': 8, 'cv__ngram_range': (1, 3)} for  'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)], 'cv__min_df':np.arange(2,9)
- best_params = {'rf__max_depth': None, 'rf__n_estimators': 500} for params_rf = {
    'rf__max_depth':[None, 1,3,5],
    'rf__n_estimators':[100,500,700]
}
- best_params = {'cv__min_df': 7} for params_rf = {
    'cv__min_df':np.arange(7,11)
}

In [246]:
# Best model
pipe_rf = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=7,strip_accents = 'unicode')),
    ('rf', RandomForestClassifier(max_depth=None,n_estimators=500))
])
pipe_rf.fit(X_train, y_train_enc)
pipe_rf.score(X_train, y_train_enc), pipe_rf.score(X_test, y_test_enc)

(0.9384770549672213, 0.9077620967741935)

#### Extra Trees Pipeline

In [247]:
pipe_et = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=3,strip_accents = 'unicode')),
    ('et', ExtraTreesClassifier())
])
pipe_et.fit(X_train, y_train_enc)
pipe_et.score(X_train, y_train_enc), pipe_et.score(X_test, y_test_enc)

(0.9384770549672213, 0.9017137096774194)

In [128]:
pipe_et.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cv', 'dt', 'cv__analyzer', 'cv__binary', 'cv__decode_error', 'cv__dtype', 'cv__encoding', 'cv__input', 'cv__lowercase', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'cv__preprocessor', 'cv__stop_words', 'cv__strip_accents', 'cv__token_pattern', 'cv__tokenizer', 'cv__vocabulary', 'dt__ccp_alpha', 'dt__class_weight', 'dt__criterion', 'dt__max_depth', 'dt__max_features', 'dt__max_leaf_nodes', 'dt__min_impurity_decrease', 'dt__min_samples_leaf', 'dt__min_samples_split', 'dt__min_weight_fraction_leaf', 'dt__random_state', 'dt__splitter'])

In [263]:
params_et = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)],
    'cv__min_df':np.arange(2,9)
}

In [264]:
gs_et = GridSearchCV(pipe_et, param_grid = params_et, n_jobs=-1)

In [265]:
gs_et.fit(X_train,y_train_enc);

In [266]:
gs_et.best_params_

{'cv__min_df': 7, 'cv__ngram_range': (1, 4)}

In [267]:
gs_et.best_score_

0.9070441229477494

In [268]:
gs_et.cv_results_

{'mean_fit_time': array([10.23969059, 28.04750681, 51.94104457, 62.28181939, 37.29878163,
        42.29960465, 42.65438409, 17.41498642, 24.26708655, 26.56133804,
        25.95475655, 15.50171137, 17.56963019, 19.58457317, 12.0753406 ,
        15.81823492, 17.9631947 , 20.56166101, 12.23987908, 14.298487  ,
        16.60298271, 11.35798039, 15.02687716, 17.09854264, 19.34913197,
        11.03969698, 12.86578293, 15.22943788, 11.40297518, 14.10174208,
        16.13020458, 18.1261714 , 10.18930697, 12.12218704, 14.55101056,
        11.29669328, 13.61789374, 15.63512831, 18.41371427,  9.9675674 ,
        11.9394774 , 13.95401711, 10.53455081, 12.7180274 , 14.92627234,
        17.09014983,  9.50356145, 11.85960536, 10.81490626]),
 'std_fit_time': array([0.08860364, 5.01563102, 6.46917017, 3.19543706, 3.86408846,
        0.71740015, 3.28490861, 0.5015007 , 0.55071845, 0.83199271,
        1.80934548, 0.348318  , 0.29094769, 0.25926242, 0.12637182,
        0.29588728, 0.08362958, 0.24833954, 

I will feed the results back into my model. Results:
- best_params = {'et__max_features': 'auto'} params_et = {
    'et__max_features':['auto', 'sqrt','log2']
}
- best_params = {'cv__min_df': 7, 'cv__ngram_range': (1, 4)} for params_et = {
    'cv__ngram_range':[(1,1),(1,2),(1,3),(1,4),(2,2),(2,3),(2,4)],
    'cv__min_df':np.arange(2,9)
}

In [None]:
# Best Model
pipe_et = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,4),min_df=7)),
    ('et', ExtraTreesClassifier())
])
pipe_et.fit(X_train, y_train_enc)
pipe_et.score(X_train, y_train_enc), pipe_et.score(X_test, y_test_enc)

(0.9384770549672213, 0.905241935483871)

#### Multinomial Naive Bayes Pipeline

In [280]:
pipe_mnb = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=3,strip_accents = 'unicode')),
    ('mnb', MultinomialNB())
])
pipe_mnb.fit(X_train, y_train_enc)
pipe_mnb.score(X_train, y_train_enc), pipe_mnb.score(X_test, y_test_enc)

(0.8813245923684653, 0.8805443548387096)

In [281]:
pipe_mnb.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cv', 'mnb', 'cv__analyzer', 'cv__binary', 'cv__decode_error', 'cv__dtype', 'cv__encoding', 'cv__input', 'cv__lowercase', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'cv__preprocessor', 'cv__stop_words', 'cv__strip_accents', 'cv__token_pattern', 'cv__tokenizer', 'cv__vocabulary', 'mnb__alpha', 'mnb__class_prior', 'mnb__fit_prior'])

In [289]:
params_mnb = {
    'mnb__alpha':[1/50,1/20,1/10]
}

In [290]:
gs_mnb = GridSearchCV(pipe_mnb, param_grid = params_mnb, n_jobs=-1)

In [291]:
gs_mnb.fit(X_train,y_train_enc);

In [292]:
gs_mnb.best_params_

{'mnb__alpha': 0.1}

In [293]:
gs_mnb.best_score_

0.8727499275572297

In [294]:
gs_mnb.cv_results_

{'mean_fit_time': array([8.05634322, 7.59803061, 6.75429106]),
 'std_fit_time': array([0.11787016, 0.71427664, 0.12732171]),
 'mean_score_time': array([1.30451722, 1.39987345, 1.39128919]),
 'std_score_time': array([0.04749604, 0.08632521, 0.06455451]),
 'param_mnb__alpha': masked_array(data=[0.02, 0.05, 0.1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'mnb__alpha': 0.02}, {'mnb__alpha': 0.05}, {'mnb__alpha': 0.1}],
 'split0_test_score': array([0.8697479 , 0.87142857, 0.87142857]),
 'split1_test_score': array([0.87226891, 0.87310924, 0.87310924]),
 'split2_test_score': array([0.87563025, 0.87563025, 0.87478992]),
 'split3_test_score': array([0.88235294, 0.88235294, 0.88235294]),
 'split4_test_score': array([0.86122792, 0.86122792, 0.86206897]),
 'mean_test_score': array([0.87224558, 0.87274979, 0.87274993]),
 'std_test_score': array([0.00694611, 0.00685778, 0.00651671]),
 'rank_test_score': array([3, 2, 1], dtype=int32)}

I will feed the results back into my model. Results:
- best_params = {'mnb__alpha': 0.1} for params_mnb = {
    'mnb__alpha':[1/10,1/2,1,2,5,10]
}
- best_params = {'mnb__alpha': 0.1} for params_mnb = {
    'mnb__alpha':[1/50,1/20,1/10]
}

In [288]:
# Best model
pipe_mnb = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words,ngram_range=(1,3),min_df=3,strip_accents = 'unicode')),
    ('mnb', MultinomialNB(alpha=1/10))
])
pipe_mnb.fit(X_train, y_train_enc)
pipe_mnb.score(X_train, y_train_enc), pipe_mnb.score(X_test, y_test_enc)

(0.8838460245419398, 0.8810483870967742)

In [301]:
pipevc1 = VotingClassifier([pipe_lr,pipe_dt])

TypeError: __init__() takes 2 positional arguments but 3 were given

In [298]:
vc1.fit(X_train,y_train_enc)

TypeError: argument of type 'CountVectorizer' is not iterable