In [252]:
import numpy as np
import pandas as pd
import glob
import os

from sklearn.model_selection import (train_test_split, cross_val_predict, 
cross_val_score)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.model_selection import RandomizedSearchCV

#NLP Libs
import re
import spacy
from spacy import displacy
from cleantext import clean
import xgboost

# Lemmatizing Libs
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords');



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/william/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


###### Import, cleaning, traint test split

In [2]:
list_of_files = glob.glob('./scitech_data_scraped/*.csv') #
latest_file = max(list_of_files, key=os.path.getctime)

In [3]:
sci_tech_data = pd.read_csv(latest_file)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Drop duplicate rows.
sci_tech_data.drop_duplicates(subset = ['selftext','title'], inplace=True)

In [153]:
# Select rows with tech or science as the subreddits and save the feature space, 
# 'title' and the target, 'subreddit' to respective variables.

subreddit = sci_tech_data.loc[(sci_tech_data['subreddit'].isin(
                                ['technology','science']), 'subreddit')]
title = sci_tech_data.loc[(sci_tech_data['subreddit'].isin(
                                ['technology','science']),'title')]
# Set X and y
X = title
y = subreddit

In [156]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, 
                                                    stratify=y)
# Re-index to allow for easy splicing
(X_train.index, y_train.index, X_test.index, y_test.index) = \
(range(0, X_train.shape[0]),range(0, X_train.shape[0]), \
 range(0, X_test.shape[0]),range(0, X_test.shape[0]))

[remove stop words](https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe)

In [146]:
stop_words = stopwords.words('english') + ['technology', 'science']
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [145]:
def filtered(data):
    # replace "-" w/ 'hyphen', then remove emojis, punctuation, digits, and urls
    # while keeping an indicator or url usage.
    data_filtered = data.map(lambda x: clean(
                            re.sub('-',repl=' hyphen ', string = x) , no_emoji=True,
                                no_punct=True,no_digits=True, no_urls=True))

    # remove leftovers from the 'clean' function  

    data_filtered = data_filtered.map(lambda x: re.sub('(0|\|)',
                                                    repl='',string = x))
    #replacing hyphen with '-'
    data_filtered = data_filtered.map(lambda x: re.sub('hyphen',repl='-', string = x))
    # remove stop words   
    data_filtered = data_filtered.map(lambda x: ' '.join([word for word 
                                              in x.split() if word not in 
                                              stop_words]))
    return data_filtered

In [160]:
X_train_filtered = filtered(X_train)
X_test_filtered = filtered(X_test)

[how to get latest file](https://stackoverflow.com/questions/39327032/how-to-get-the-latest-file-in-a-folder)

In [10]:
X_train.shape

(7392,)

I'll get POS counts on the raw data.
[dict to df](https://sparkbyexamples.com/python/pandas-convert-list-of-dictionaries-to-dataframe/#:~:text=The%20from_records()%20method%20is,dicts%20%2C%20or%20from%20another%20DataFrame.); 
[spacy POS tagging](https://machinelearningknowledge.ai/tutorial-on-spacy-part-of-speech-pos-tagging/); [spacy POS tagging #2](https://www.geeksforgeeks.org/python-pos-tagging-and-lemmatization-using-spacy/)

In [11]:
X_train[0].split(" ")

['Proving',
 'a',
 'point:',
 '+|-',
 'karma',
 'has',
 'nothing',
 'to',
 'do',
 'with',
 'quality',
 'of',
 'post']

###### statistics

In [15]:
nlp = spacy.load('en_core_web_sm')

In [16]:
def tokenize(titles):
    '''tokenize a 1d object containing strings, returning a list of lists of 
    tokenized words'''
    
    tokenized = []
    
    for title in titles:
        tokenized.append(nlp(title))
        
    return tokenized

In [17]:
X_train_tokens = tokenize(X_train)
X_test_tokens = tokenize(X_test)

In [18]:
def get_pos_features(tokenized):
    '''get the stats features that will be used in predicting, returned as a 
    sorted dictionary with keys as spacy parts of speech and fine pos tags'''
    features = []
    for tokens in tokenized:
        for token in tokens:
            if token.pos_ not in features:
                features.append(token.pos_)
            if token.tag_ not in features:
                features.append(token.tag_)
    return dict(zip(sorted(features),np.zeros(len(features)).astype(int)))

In [19]:
pos_features = get_pos_features(X_train_tokens)
pos_features

{'$': 0,
 "''": 0,
 ',': 0,
 '-LRB-': 0,
 '-RRB-': 0,
 '.': 0,
 ':': 0,
 'ADD': 0,
 'ADJ': 0,
 'ADP': 0,
 'ADV': 0,
 'AFX': 0,
 'AUX': 0,
 'CC': 0,
 'CCONJ': 0,
 'CD': 0,
 'DET': 0,
 'DT': 0,
 'EX': 0,
 'FW': 0,
 'HYPH': 0,
 'IN': 0,
 'INTJ': 0,
 'JJ': 0,
 'JJR': 0,
 'JJS': 0,
 'LS': 0,
 'MD': 0,
 'NFP': 0,
 'NN': 0,
 'NNP': 0,
 'NNPS': 0,
 'NNS': 0,
 'NOUN': 0,
 'NUM': 0,
 'PART': 0,
 'PDT': 0,
 'POS': 0,
 'PRON': 0,
 'PROPN': 0,
 'PRP': 0,
 'PRP$': 0,
 'PUNCT': 0,
 'RB': 0,
 'RBR': 0,
 'RBS': 0,
 'RP': 0,
 'SCONJ': 0,
 'SPACE': 0,
 'SYM': 0,
 'TO': 0,
 'UH': 0,
 'VB': 0,
 'VBD': 0,
 'VBG': 0,
 'VBN': 0,
 'VBP': 0,
 'VBZ': 0,
 'VERB': 0,
 'WDT': 0,
 'WP': 0,
 'WP$': 0,
 'WRB': 0,
 'X': 0,
 'XX': 0,
 '_SP': 0,
 '``': 0}

In [102]:
ss = StandardScaler()
logreg = LogisticRegressionCV(max_iter=10_000)
pipe_lr = Pipeline([
    ('ss',ss),
    ('lr',logreg)
])

In [104]:
pipe_lr.fit(X_train_stats,y_train)

In [106]:
pipe_lr.score(X_train_stats,y_train), pipe_lr.score(X_test_stats,y_test)

(0.7955898268398268, 0.797971602434077)

In [113]:
pipe_lr['lr'].predict_proba(X_test_stats)



array([[1.00000000e+000, 0.00000000e+000],
       [1.00000000e+000, 1.71218293e-116],
       [1.00000000e+000, 7.53822428e-130],
       ...,
       [1.00000000e+000, 1.69389133e-106],
       [1.00000000e+000, 2.62290047e-141],
       [1.00000000e+000, 9.93644186e-133]])

In [98]:
logreg.fit(X_train_stats,y_train)

In [83]:
def get_title_stats(titles, tokenized, pos_feat_dict):
    features = pos_feat_dict
    '''generate title statistics from a 1d object of title and return as a 
    DataFrame '''
    
    # Char length, word count, max and avg word length
    title_stats = pd.DataFrame()
    title_stats['title_length'] = [len(titles[i]) for i in range(len(titles))]
    title_stats['title_word_count'] = [len(titles[i].split(' ')) for \
                                        i in range(len(titles))]
    title_stats['max_word_length'] = [max(map(len, title.split(' '))) \
                         for title in titles]
    title_stats['avg_word_length'] = title_stats['title_length']/title_stats['title_word_count']
    
    # POS counts, fine pos tag counts    
    parts_of_speech = []
    for tokens in tokenized:
        pos_counts = dict(features)
        for token in tokens:
            pos_counts[token.pos_] += 1
            pos_counts[token.tag_] += 1
        parts_of_speech.append(pos_counts)
    # combine 
    pos_df = pd.DataFrame(parts_of_speech).fillna(0)
    # pos_props_df = pos_df.iloc[:,4:].div(title_stats.title_word_count,axis=0).add_prefix("prop_")
    
    tstats_pos = pd.concat([title_stats, pos_df],axis = 1)
    return  tstats_pos

###### **Lemmatization** -- [sources](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

In [140]:
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/william/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

[Lemmatizing w/ POS](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

In [141]:
# Lemmatize with POS Tag

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

To-do/idea: get POS counts [source](https://stackoverflow.com/questions/20960777/python-how-to-count-pos-tags-from-from-a-sentence)

In [163]:
lemmatizer = WordNetLemmatizer()
# Lemmatize train
X_train_lemmatized = []
for title in X_train_filtered:
    X_train_lemmatized.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for \
                               w in nltk.word_tokenize(title)]))
X_train_lemmatized = pd.Series(data = X_train_lemmatized, index = X_train_filtered.index)
X_train_lemmatized

# Lemmatize test
X_test_lemmatized = []
for title in X_test_filtered:
    X_test_lemmatized.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for \
                               w in nltk.word_tokenize(title)]))
X_test_lemmatized = pd.Series(data = X_test_lemmatized, index = X_test_filtered.index)


0              prove point + - karma nothing quality post
1        researcher identify origin serious illness child
2               way go level human need - think ai take u
3       former amazon exec reportedly paid $ run jeff ...
4       iphone pro max gb alpine green - unlocked rene...
                              ...                        
7387    supergps accurately pinpoint position within i...
7388                 email scrap legal - resistancephlcom
7389    amazon ceo prime video attractive economics pa...
7390               possible native payment system twitter
7391    new study base foi request page found coca - c...
Length: 7392, dtype: object

In [166]:
tdif

'traffic related air pollution associate increase likelihood multiple long - term physical mental health condition research people found simple measure reduce traffic level could potentially improve life lessen pressure healthcare system'

##### EDA

Below I will inspect various distributions of title statistics after adding the statistics to the dataframe.

In [None]:
# DataFrame from the lemmatized series
df = pd.DataFrame(X_train_lemmatized, columns = ['title']).join(pd.DataFrame(y_train))

In [None]:
df_test = pd.DataFrame(X_test_lemmatized, columns = ['title']).join(pd.DataFrame(y_train))

In [None]:
df.index = range(df.shape[0])

In [None]:
df['title_length'] = [len(df.loc[i,'title']) for i in range(len(df['title']))]

In [None]:
df['title_word_count'] = [len(df.loc[i,'title'].split(' ')) for i in range(len(df['title']))]

In [None]:
df['max_word_length'] = [max(map(len, title.split(' '))) \
                         for title in df['title']] 

In [None]:
df['avg_word_length'] = df['title_length']/df['title_word_count']

In [None]:
df['subreddit_indicator']=[1 if sub == 'technology' else 0 for sub in df['subreddit']]

In [None]:
import seaborn as sns

###### Visualizations

In [None]:
sns.histplot(data = df, x = 'title_word_count', hue = 'subreddit', stat='density', common_norm = False, bins = 45).set(title = 'Distribution of title word counts by subreddit')

Above we can see that shorter titles have a better chance of having been posted to technology, while longer titles are more likely from science even after accounting for baseline frequencies.

Above we see that there are many posts between 0 and 20 words long with a 

In [None]:
sns.histplot(data = df, x = 'title_length', hue = 'subreddit', stat='density', common_norm = False).set(title = 'Distribution of title character length by subreddit')

Above we see that generally title character lengths under around 80 are more likely to have been from technology, and above 80 from science.

In [None]:
sns.histplot(data = df, x = 'avg_word_length', hue = 'subreddit', 
             stat='density', bins = 100, common_norm = False).set(
    title = 'Distribution of title word counts by subreddit', xlim = [4,11])

[adjusting bins](https://stackoverflow.com/questions/48990594/how-to-draw-distribution-plot-for-discrete-variables-in-seaborn)

In [None]:
sns.histplot(data = df, x = 'max_word_length', hue = 'subreddit', 
             stat='density', common_norm = False, bins=np.arange(0,21)).set(
    title = 'Distribution of title word counts by subreddit', xlim = [0,20], 
    xticks = range(0,21));

In [None]:
df.corr()

Above we see that word length, character count, and word counts all have some correlation with the particular subredit, though the correlation is not strong.  However, from the density plots and the clear separation in likelihoods it is apparent that valuable information would likely be picked up from a tree-based classification model.  It is promising that the title statistics gathered thus far are not fully correlated with each other as this means they can provide non-redundant information to the modeling process.

I'll next look at word count frequencies

In [None]:
cv = CountVectorizer()
vectors = cv.fit_transform(df[df['subreddit']=='technology']['title'])
vectors.A
wc_vec = pd.DataFrame(vectors.A, columns = cv.get_feature_names_out())
wc_vec.sum().sort_values(ascending = False)[0:15].plot(kind = 'bar')\
.set(title = '15 most common words word count -- technology')

In [None]:
cv = CountVectorizer()
vectors = cv.fit_transform(df[df['subreddit']=='science']['title'])
vectors.A
wc_vec = pd.DataFrame(vectors.A, columns = cv.get_feature_names_out())
wc_vec.sum().sort_values(ascending = False)[0:15].plot(kind = 'bar')\
.set(title = '15 most common words word count -- science')

Above we see that there is not much overlap within the 15 most common words.  This indicates that there is a good chance that these common words will help with distinguishing between the subreddits.  Of note is that proper nouns seem to feature heavily in technology and almost not at all in science.

##### **Modeling**

The idea is to use a stacked model with one model using the vecotrized/lemmatized titles and the other using the title statistics as the features.

###### Iterative improvement ideas

Idea: combine these fuctionalities around the text data into a class with methods.
Idea: a class that grid search optimizes functions in the background.

###### Baseline

In [126]:
dc = DummyClassifier()
dc.fit(X_train,y_train)
dc.score(X_test,y_test)

0.6791075050709939

The baseline model accuracy is 67.9%

###### binary target encoding

In [291]:
# binary format for XGB target
y_train_binary = [1 if sub == 'technology' else 0 for sub in y_train]
y_test_binary = [1 if sub == 'technology' else 0 for sub in y_test]

###### modeling on statistics of the titles

In [318]:
X_train_stats = get_title_stats(X_train,X_train_tokens,pos_features)
X_test_stats = get_title_stats(X_test,X_test_tokens,pos_features)

In [319]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train_stats, y_train_binary)

In [320]:
xgb.score(X_train_stats, y_train_binary), xgb.score(X_test_stats, y_test_binary)

(0.9296536796536796, 0.8068965517241379)

The initial model of the score indicates substantial overfitting, so I will attempt to use random search to optimize the parameters max_depth, min_child_weight, gamma, subsample and colsample_bytree. [Control overfitting](https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html)

In [269]:
np.linspace(1,6, num = 6).astype(int)

array([1, 2, 3, 4, 5, 6])

[random search example](https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py)

In [284]:

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results["mean_test_score"][candidate],
                    results["std_test_score"][candidate],
                )
            )
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")

searching with bayesian [hyperopt](https://grabngoinfo.com/hyperparameter-tuning-for-xgboost-grid-search-vs-random-search-vs-bayesian-optimization/) is a possible further exploration

In [311]:
xgb = xgboost.XGBClassifier()
param_dist = {
              # 'booster': ['gbtree'],
              'max_depth': np.linspace(1,6, num = 6).astype(int),
              # 'min_child_weight': np.logspace(.01, 1, num = 3),
              # 'gamma': np.logspace(.01, 1, num = 5),
              # 'subsample': np.linspace(.5,1,num=4),
              'colsample_bytree': np.linspace(.1,1,num=4)
              }
n_iter_search = 15

In [312]:
random_search = RandomizedSearchCV(
    xgb, param_distributions=param_dist, n_iter=n_iter_search, n_jobs=-1
)

In [None]:
report(random_search)

In [313]:
random_search.fit(X_train_stats,y_train_binary)

In [314]:
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.801 (std: 0.009)
Parameters: {'max_depth': 3, 'colsample_bytree': 0.7}

Model with rank: 2
Mean validation score: 0.800 (std: 0.008)
Parameters: {'max_depth': 4, 'colsample_bytree': 1.0}

Model with rank: 3
Mean validation score: 0.799 (std: 0.010)
Parameters: {'max_depth': 4, 'colsample_bytree': 0.4}



In [316]:
random_search.score(X_train_stats,y_train_binary),random_search.score(X_test_stats,y_test_binary)

(0.832521645021645, 0.8004056795131845)

I have been unable to improve test performance through tuning the xgboost parameters.

###### Modeling on the titles

In [139]:
xgb_c_titles = xgb.XGBClassifier()

In [None]:
xgb_c_titles.fit(X)

In [201]:
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1_000,
                             ngram_range=(1,2))
logreg = LogisticRegression(penalty='elasticnet', max_iter=10_000, \
                            random_state=1)

In [202]:
tfidf_lr_params = {"logreg__C": [.1,1,10],
                    "logreg__l1_ratio": [.1,.5,.9],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}

In [203]:

tfidf_lr_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('logreg', LogisticRegression(penalty='none', max_iter=10_000, random_state=33))
])

In [204]:
tfidf_lr_pipe.fit(X_train_lemmatized, y_train_xgb)

In [205]:
tfidf_lr_pipe.score(X_test_lemmatized, y_test_xgb)

0.8567951318458418

In [None]:
gs_tfidf_lr = GridSearchCV(tfidf_lr_pipe,tfidf_lr_params)

In [None]:
gs_tfidf_lr.score(X_test_lemmatized,y_test)

In [None]:
tfidf_xgb_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('xgb', xgb.XGBClassifier())])
tfidf_xgb_params = {"xgb__eta": [.01,.03],
                    'xgb__booster': ['gbtree','gblinear'],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,tfidf_xgb_params)

In [None]:
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [None]:
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,{'tfidf_vec__max_df': [0.9],
 'tfidf_vec__min_df': [0.001],
 'xgb__booster': ['gblinear'],
 'xgb__eta': [0.01]})

 

In [91]:
[1 if p =='technology' else 0 for p in rfc.predict(X_test_stats)]

[0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,


In [224]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

(0.9119318181818182, 0.8884381338742393)

In [225]:
gs_tfidf_xgb.best_params_

{'tfidf_vec__max_df': 0.925,
 'tfidf_vec__min_df': 0.001,
 'xgb__booster': 'gblinear',
 'xgb__eta': 0.01}

param choices:
tfidf_xgb_params = {"xgb__eta": [.01,.03],
                    'xgb__booster': ['gbtree','gblinear'],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}
score: 

(0.9143668831168831, 0.8892494929006085)

gs_tfidf_xgb.best_params_ :

{'tfidf_vec__max_df': 0.9,
 'tfidf_vec__min_df': 0.001,
 'xgb__booster': 'gblinear',
 'xgb__eta': 0.01}
 

So far xgboost has provided the best prediction on the test set.  I will try to narrow in on the best parameters.

In [206]:
tfidf_xgb_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('xgb', xgb.XGBClassifier(seed = 1))])
tfidf_xgb_params = {"xgb__eta": [.01,.015,.05],
                    'xgb__booster': ['gbtree','gblinear'],
                    # 'xgb__lambda': [.1,1,10],
                    # 'xgb__alpha': [0,.1,1],
                   'tfidf_vec__max_df': [.85,.9,.925],
                   'tfidf_vec__min_df': [.0005,.001,.0015],}
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,tfidf_xgb_params)

In [207]:
gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)

In [226]:
gs_tfidf_xgb.best_params_

{'tfidf_vec__max_df': 0.925,
 'tfidf_vec__min_df': 0.001,
 'xgb__booster': 'gblinear',
 'xgb__eta': 0.01}

gs_tfidf_xgb.best_params_
{'tfidf_vec__max_df': 0.85,
 'tfidf_vec__min_df': 0.001,
 'xgb__alpha': 0,
 'xgb__booster': 'gbtree',
 'xgb__eta': 0.05,
 'xgb__lambda': 0.1}


In [208]:
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

(0.9119318181818182, 0.8884381338742393)

In [None]:
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,{'tfidf_vec__max_df': [0.9],
 'tfidf_vec__min_df': [0.001],
 'xgb__booster': ['gblinear'],
 'xgb__eta': [0.005]})

In [None]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [323]:
# pass in params from tuned xgb, tfidf

# {'tfidf_vec__max_df': 0.925,
#  'tfidf_vec__min_df': 0.001,
#  'xgb__booster': 'gblinear',
#  'xgb__eta': 0.01}

# Instantiate Vectorizer with previously grid search optimized parameters
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1_000,
                             ngram_range=(1,2), max_df= 0.925, min_df= 0.001)

title_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('xgb', xgboost.XGBClassifier(booster= 'gblinear',eta= 0.01))])

title_pipe.fit(X_train_lemmatized, y_train_xgb)
title_pipe.score(X_train_lemmatized, y_train_xgb), title_pipe.score(X_test_lemmatized, y_test_xgb)

(0.9119318181818182, 0.8884381338742393)

Above, we see that the titles have more predictive value alone than do the statistics. 

###### stacked models

In [332]:
# Fit models
xgb_stats = xgboost.XGBClassifier()

# Instantiate pipes
# stats
xgb_stats.fit(X_train_stats, y_train_binary)

# title
title_pipe.fit(X_train_lemmatized, y_train_binary)

In [333]:
X_pred_train = pd.DataFrame()
X_pred_train['stats'] = cross_val_predict(xgb_stats, X_train_stats, y_train_binary)
X_pred_train['title'] = cross_val_predict(title_pipe, X_train_lemmatized, y_train_binary)

In [184]:
# Level 2 model: Logistic Regression
lev2_logreg = LogisticRegressionCV()

In [334]:
lev2_logreg.fit(X_pred_train, y_train_xgb)

In [335]:
lev2_logreg.score(X_pred_train,y_train_xgb)

0.8432088744588745

Conclusion: this form of stacking as led to worse results than seen with just the model fitted on the title data.  I will next attempt to use a column transformer to combine the title and stats data into one dataset.

[source: strip characters from string in series](https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column)
[source: remove punctuation](https://www.google.com/search?q=how+to+replace+punctuation+with+regular+expression+python&rlz=1C5CHFA_enUS983US983&oq=how+to+replace+punctuation+with+regular&aqs=chrome.1.69i57j33i160l2.10574j0j7&sourceid=chrome&ie=UTF-8#kpvalbx=_7SabY4OANaSs0PEP042roAM_32)

###### Column transforming pipeline to generate final model

In [336]:
from sklearn.compose import ColumnTransformer

In [345]:
X_train_full = pd.concat([X_train_lemmatized.rename('title'), X_train_stats], axis = 1)
X_test_full = pd.concat([X_test_lemmatized.rename('title'), X_test_stats], axis = 1)

In [358]:
X_train_full

Unnamed: 0,title,title_length,title_word_count,max_word_length,avg_word_length,$,'',",",-LRB-,-RRB-,...,VBZ,VERB,WDT,WP,WP$,WRB,X,XX,_SP,``
0,prove point + - karma nothing quality post,65,13,7,5.000000,0,0,0,0,0,...,1,3,0,0,0,0,1,0,0,0
1,researcher identify origin serious illness child,70,10,11,7.000000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,way go level human need - think ai take u,95,20,8,4.750000,0,0,0,0,0,...,0,5,0,0,0,0,0,0,0,0
3,former amazon exec reportedly paid $ run jeff ...,153,26,10,5.884615,1,0,1,0,0,...,0,3,1,0,0,0,0,0,0,0
4,iphone pro max gb alpine green - unlocked rene...,67,11,8,6.090909,0,0,2,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7387,supergps accurately pinpoint position within i...,85,10,12,8.500000,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,1
7388,email scrap legal - resistancephlcom,44,6,17,7.333333,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
7389,amazon ceo prime video attractive economics pa...,68,10,10,6.800000,0,1,0,1,1,...,1,1,0,0,0,0,0,0,0,1
7390,possible native payment system twitter,42,6,8,7.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [373]:
from sklearn.preprocessing import PolynomialFeatures 

In [376]:
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1_000,ngram_range=(1,2))

text_transformer = tfidf_vec

numeric_transformer = Pipeline(
    steps = [('scaler',StandardScaler()),
             ('polynomial', PolynomialFeatures(degree=2, interaction_only=True))
            ]
)
text_features = 'title'
numeric_features = X_train_full.columns[1:].values

preprocessor = ColumnTransformer(
    transformers=[
    ('txt', text_transformer, text_features),
    ('num', numeric_transformer, numeric_features)
    ]
)

In [391]:
clf = Pipeline(
    steps = [('preprocessor', preprocessor), ('classifier', xgboost.XGBClassifier(max_depth=6))]
)
clf.fit (X_train_full, y_train_binary)

In [392]:
clf.score(X_train_full,y_train_binary clf.score(X_test_full,y_test_binary)

(0.9552218614718615, 0.8713995943204869)