In [1]:
import numpy as np
import pandas as pd
import glob
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#NLP Libs
import re
import spacy
from spacy import displacy
from cleantext import clean
import xgboost as xgb

# Lemmatizing Libs
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords');



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/william/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[how to get latest file](https://stackoverflow.com/questions/39327032/how-to-get-the-latest-file-in-a-folder)

In [2]:
list_of_files = glob.glob('./scitech_data_scraped/*.csv') #
latest_file = max(list_of_files, key=os.path.getctime)

In [3]:
sci_tech_data = pd.read_csv(latest_file)

  exec(code_obj, self.user_global_ns, self.user_ns)


Below I drop duplicate rows.

In [4]:
sci_tech_data.drop_duplicates(subset = ['selftext','title'], inplace=True)

Below I select the relevent rows that have tech or science as the subreddits and save the feature space, 'title' and the target, 'subreddit' to respective variables.

In [5]:
sci_tech_data['subreddit_name_prefixed'].value_counts()

r/technology              6692
r/science                 3165
u/MEGA-Technology          193
u/Ok-Technology-1912       142
u/Candid-Science-7189      132
u/Gold-Science-2230         94
u/Away-Technology-4883      63
u/Any-Technology-9975       61
u/Dazzling-Science-652      34
u/AVID-Technology           30
u/Content-Technology-7      19
u/Bugd-Technology            7
u/kretoss-technology         5
u/SecureAge-Technology       4
u/Thin-Science-6996          3
u/OV-Technology              3
u/Slow-Technology-9949       3
u/Ok-Science-1826            2
u/FS-Technology              2
u/A-science-enthusiast       2
u/Efficient-Science-80       1
u/Ok-Science-9480            1
u/No-Technology-2687         1
u/Jaded-Technology-332       1
u/more-technology-00         1
u/Dear-Technology-6015       1
u/Then-Technology-9558       1
u/MR-Technology              1
u/Fearless-Science-103       1
u/Worth-Science-4441         1
u/awesome-technology         1
u/visimens-technology        1
u/Fearle

In [6]:
subreddit = sci_tech_data.loc[(sci_tech_data['subreddit'].isin(
                                ['technology','science']), 'subreddit')]
title = sci_tech_data.loc[(sci_tech_data['subreddit'].isin(
                                ['technology','science']),'title')]

In [7]:
X = title
y = subreddit

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [9]:
(X_train.index, y_train.index, X_test.index, y_test.index) = \
(range(0, X_train.shape[0]),range(0, X_train.shape[0]), \
 range(0, X_test.shape[0]),range(0, X_test.shape[0]))

In [10]:
X_train.shape

(7392,)

I'll get POS counts on the raw data.
[dict to df](https://sparkbyexamples.com/python/pandas-convert-list-of-dictionaries-to-dataframe/#:~:text=The%20from_records()%20method%20is,dicts%20%2C%20or%20from%20another%20DataFrame.); 
[spacy POS tagging](https://machinelearningknowledge.ai/tutorial-on-spacy-part-of-speech-pos-tagging/); [spacy POS tagging #2](https://www.geeksforgeeks.org/python-pos-tagging-and-lemmatization-using-spacy/)

In [11]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Counting the frequencies of different fine-grained tags:
TAG_counts = doc.count_by(spacy.attrs.TAG)

print(TAG_counts)
for k,v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

{15267657372422890137: 2, 10554686591937588953: 3, 15308085513773655218: 2, 17109001835818727656: 1, 1292078113972184607: 1, 74: 1, 164681854541413346: 1, 12646065887601541794: 1}
74. POS : 1
164681854541413346. RB  : 1
1292078113972184607. IN  : 1
10554686591937588953. JJ  : 3
12646065887601541794. .   : 1
15267657372422890137. DT  : 2
15308085513773655218. NN  : 2
17109001835818727656. VBD : 1


In [12]:
nlp = spacy.load("en_core_web_sm")
parts_of_speech = []

In [13]:
def pos_stats(array):
    parts_of_speech = []
    for title in array:
        tokenized = nlp(title)
        pos_counts = {}
        for token in tokenized:
            if token.pos_ not in pos_counts.keys():
                pos_counts[token.pos_] = 1
            else :
                pos_counts[token.pos_] += 1
        parts_of_speech.append(pos_counts)
    return pd.DataFrame(parts_of_speech).fillna(0)

In [14]:
from sklearn.dummy import DummyClassifier
dc = DummyClassifier()
dc.fit(X_train,y_train)
dc.score(X_train,y_train)

0.6788419913419913

In [15]:
X_train[0].split(" ")

['Proving',
 'a',
 'point:',
 '+|-',
 'karma',
 'has',
 'nothing',
 'to',
 'do',
 'with',
 'quality',
 'of',
 'post']

In [16]:
def title_stats(titles):
    '''generate title statistics from a 1d object of title and return as a 
    DataFrame '''
    # Char length, word count, max and avg word length
    title_stats = pd.DataFrame(titles, columns = ['title'])
    title_stats['title_length'] = [len(titles[i]) for i in range(len(titles))]
    title_stats['title_word_count'] = [len(titles[i].split(' ')) for \
                                        i in range(len(titles))]
    title_stats['max_word_length'] = [max(map(len, title.split(' '))) \
                         for title in titles]
    title_stats['avg_word_length'] = title_stats['title_length']/title_stats['title_word_count']
    
    # POS stats
    parts_of_speech = []
    
    for title in titles:
        tokenized = nlp(title)
        pos_counts = {}
        for token in tokenized:
            if token.pos_ not in pos_counts.keys():
                pos_counts[token.pos_] = 1
            else :
                pos_counts[token.pos_] += 1
        parts_of_speech.append(pos_counts)
    title_stats = title_stats.join(pd.DataFrame(parts_of_speech).fillna(0))
    return title_stats.iloc[:,1:]

In [48]:
def tokenize(titles):
    '''tokenize a 1d object containing strings, returning a list of lists of 
    tokenized words'''
    
    tokenized = []
    for title in titles:
        tokenized.append(nlp(title))
        
    return tokenized

In [56]:
X_train_tokens[0][0].pos_

'VERB'

In [18]:
def pos_features(titles, tokenized):
    'get the stats features that will be used in predicting'
    
    features = []
    for title in titles:
        tokenized = nlp(title)
        for token in tokenized:
            if token.pos_ not in features:
                features.append(token.pos_)
            if token.tag_ not in features:
                features.append(token.tag_)       
    return features

In [58]:
len(pos_features)

67

In [19]:
pos_features = pos_features(X_train)

In [72]:
pos_features

['VERB',
 'VBG',
 'DET',
 'DT',
 'NOUN',
 'NN',
 'PUNCT',
 ':',
 'X',
 'LS',
 'PROPN',
 'NNP',
 'VBZ',
 'PRON',
 'PART',
 'TO',
 'VB',
 'ADP',
 'IN',
 'NNS',
 'AUX',
 'VBP',
 'VBN',
 'ADJ',
 'JJ',
 '.',
 'MD',
 'PRP',
 'SCONJ',
 'RP',
 'VBD',
 'ADV',
 'RB',
 'JJR',
 'SYM',
 '$',
 'NUM',
 'CD',
 'POS',
 ',',
 'WDT',
 'HYPH',
 '-LRB-',
 '-RRB-',
 '``',
 "''",
 'CCONJ',
 'CC',
 'WRB',
 'INTJ',
 'UH',
 'XX',
 'WP',
 'PRP$',
 'JJS',
 'NNPS',
 'RBR',
 'EX',
 'RBS',
 'PDT',
 'FW',
 'ADD',
 'NFP',
 'SPACE',
 '_SP',
 'WP$',
 'AFX']

In [73]:
pos_feat_dict = dict(zip(pos_features,np.zeros(len(pos_features)).astype(int)))

In [74]:
pos_feat_dict

{'VERB': 0,
 'VBG': 0,
 'DET': 0,
 'DT': 0,
 'NOUN': 0,
 'NN': 0,
 'PUNCT': 0,
 ':': 0,
 'X': 0,
 'LS': 0,
 'PROPN': 0,
 'NNP': 0,
 'VBZ': 0,
 'PRON': 0,
 'PART': 0,
 'TO': 0,
 'VB': 0,
 'ADP': 0,
 'IN': 0,
 'NNS': 0,
 'AUX': 0,
 'VBP': 0,
 'VBN': 0,
 'ADJ': 0,
 'JJ': 0,
 '.': 0,
 'MD': 0,
 'PRP': 0,
 'SCONJ': 0,
 'RP': 0,
 'VBD': 0,
 'ADV': 0,
 'RB': 0,
 'JJR': 0,
 'SYM': 0,
 '$': 0,
 'NUM': 0,
 'CD': 0,
 'POS': 0,
 ',': 0,
 'WDT': 0,
 'HYPH': 0,
 '-LRB-': 0,
 '-RRB-': 0,
 '``': 0,
 "''": 0,
 'CCONJ': 0,
 'CC': 0,
 'WRB': 0,
 'INTJ': 0,
 'UH': 0,
 'XX': 0,
 'WP': 0,
 'PRP$': 0,
 'JJS': 0,
 'NNPS': 0,
 'RBR': 0,
 'EX': 0,
 'RBS': 0,
 'PDT': 0,
 'FW': 0,
 'ADD': 0,
 'NFP': 0,
 'SPACE': 0,
 '_SP': 0,
 'WP$': 0,
 'AFX': 0}

Get tokens for X_train, X_test

In [50]:
X_train_tokens = tokenize(X_train)

In [65]:
X_test_tokens = tokenize(X_test)

In [96]:
def title_stats(titles, tokenized, pos_feat_dict):
    '''generate title statistics from a 1d object of title and return as a 
    DataFrame '''
    # Char length, word count, max and avg word length
    title_stats = pd.DataFrame()
    title_stats['title_length'] = [len(titles[i]) for i in range(len(titles))]
    title_stats['title_word_count'] = [len(titles[i].split(' ')) for \
                                        i in range(len(titles))]
    title_stats['max_word_length'] = [max(map(len, title.split(' '))) \
                         for title in titles]
    title_stats['avg_word_length'] = title_stats['title_length']/title_stats['title_word_count']
    
    # POS counts, fine pos tag counts    
    parts_of_speech = []

    for tokens in tokenized:

        pos_counts = pos_feat_dict
        for token in tokens:
                pos_counts[token.pos_] += 1
                pos_counts[token.tag_] += 1
        parts_of_speech.append(pos_counts)
    # combine 
    pos_df = pd.DataFrame(parts_of_speech).fillna(0)
    tstats_pos = pd.concat([title_stats, pos_df],axis = 1)
    return  tstats_pos

In [None]:
title_stats()

In [68]:
pos_feat_dict  # why is pos_feat_dict mutating

{'VERB': 14914,
 'VBG': 2720,
 'DET': 6897,
 'DT': 7191,
 'NOUN': 30151,
 'NN': 21455,
 'PUNCT': 12951,
 ':': 1451,
 'X': 208,
 'LS': 11,
 'PROPN': 20461,
 'NNP': 19382,
 'VBZ': 4257,
 'PRON': 5366,
 'PART': 3395,
 'TO': 2005,
 'VB': 4637,
 'ADP': 12893,
 'IN': 13525,
 'NNS': 8927,
 'AUX': 5209,
 'VBP': 2638,
 'VBN': 2588,
 'ADJ': 9974,
 'JJ': 9055,
 '.': 3612,
 'MD': 1446,
 'PRP': 2545,
 'SCONJ': 1724,
 'RP': 383,
 'VBD': 1799,
 'ADV': 3302,
 'RB': 3386,
 'JJR': 571,
 'SYM': 797,
 '$': 351,
 'NUM': 3138,
 'CD': 3138,
 'POS': 943,
 ',': 3404,
 'WDT': 523,
 'HYPH': 1846,
 '-LRB-': 499,
 '-RRB-': 541,
 '``': 780,
 "''": 770,
 'CCONJ': 3003,
 'CC': 3003,
 'WRB': 728,
 'INTJ': 193,
 'UH': 195,
 'XX': 88,
 'WP': 488,
 'PRP$': 1148,
 'JJS': 346,
 'NNPS': 1083,
 'RBR': 280,
 'EX': 102,
 'RBS': 89,
 'PDT': 36,
 'FW': 69,
 'ADD': 40,
 'NFP': 49,
 'SPACE': 71,
 '_SP': 71,
 'WP$': 3,
 'AFX': 2}

In [75]:
pos_feat_dict

{'VERB': 0,
 'VBG': 0,
 'DET': 0,
 'DT': 0,
 'NOUN': 0,
 'NN': 0,
 'PUNCT': 0,
 ':': 0,
 'X': 0,
 'LS': 0,
 'PROPN': 0,
 'NNP': 0,
 'VBZ': 0,
 'PRON': 0,
 'PART': 0,
 'TO': 0,
 'VB': 0,
 'ADP': 0,
 'IN': 0,
 'NNS': 0,
 'AUX': 0,
 'VBP': 0,
 'VBN': 0,
 'ADJ': 0,
 'JJ': 0,
 '.': 0,
 'MD': 0,
 'PRP': 0,
 'SCONJ': 0,
 'RP': 0,
 'VBD': 0,
 'ADV': 0,
 'RB': 0,
 'JJR': 0,
 'SYM': 0,
 '$': 0,
 'NUM': 0,
 'CD': 0,
 'POS': 0,
 ',': 0,
 'WDT': 0,
 'HYPH': 0,
 '-LRB-': 0,
 '-RRB-': 0,
 '``': 0,
 "''": 0,
 'CCONJ': 0,
 'CC': 0,
 'WRB': 0,
 'INTJ': 0,
 'UH': 0,
 'XX': 0,
 'WP': 0,
 'PRP$': 0,
 'JJS': 0,
 'NNPS': 0,
 'RBR': 0,
 'EX': 0,
 'RBS': 0,
 'PDT': 0,
 'FW': 0,
 'ADD': 0,
 'NFP': 0,
 'SPACE': 0,
 '_SP': 0,
 'WP$': 0,
 'AFX': 0}

In [93]:
X_train_stats = title_stats(X_train, X_train_tokens, pos_feat_dict)

In [94]:
X_test_stats = title_stats(X_test, X_test_tokens, pos_feat_dict)

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [95]:
rfc = RandomForestClassifier()
rfc.fit(X_train_stats, y_train)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [88]:
X_train_stats.shape

(7392, 71)

In [85]:
X_test_stats.shape

(2465, 71)

In [35]:
X_test_fine_stats.shape

(2465, 22)

In [91]:
X_train_stats

Unnamed: 0,title_length,title_word_count,max_word_length,avg_word_length,VERB,VBG,DET,DT,NOUN,NN,...,EX,RBS,PDT,FW,ADD,NFP,SPACE,_SP,WP$,AFX
0,65,13,7,5.000000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
1,70,10,11,7.000000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
2,95,20,8,4.750000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
3,153,26,10,5.884615,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
4,67,11,8,6.090909,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7387,85,10,12,8.500000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
7388,44,6,17,7.333333,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
7389,68,10,10,6.800000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2
7390,42,6,8,7.000000,14914,2720,6897,7191,30151,21455,...,102,89,36,69,40,49,71,71,3,2


In [89]:
rfc.score(X_test_stats, y_test)

0.6924949290060852

In [None]:
rfc.fit(X_train_fine_stats, y_train)

In [None]:
set(X_train_fine_stats.columns) == set(X_test_fine_stats.columns)

In [None]:
len(X_test_fine_stats.columns)

In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Counting the frequencies of different fine-grained tags:
TAG_counts = doc.count_by(spacy.attrs.TAG)

print(TAG_counts)
for k,v in sorted(TAG_counts.items()):
    print(f'{doc.vocab[k].text:{4}}: {v}')

In [None]:
TAG_counts = .count_by(spacy.attrs)

In [None]:
test_stats = title_stats(X_test)

In [None]:
stats = title_stats(X_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
params = 

In [None]:
rfc.fit(stats[['title_length', 'title_word_count', 'max_word_length',
       'avg_word_length', 'VERB', 'DET', 'NOUN', 'PUNCT', 'X', 'PROPN', 'PRON',
       'PART', 'ADP', 'AUX', 'ADJ', 'SCONJ', 'ADV', 'SYM', 'NUM', 'CCONJ',
       'INTJ', 'SPACE']],y_train)

In [None]:
stats.columns

In [None]:
rfc.score(test_stats[['title_length', 'title_word_count', 'max_word_length',
       'avg_word_length', 'VERB', 'DET', 'NOUN', 'PUNCT', 'X', 'PROPN', 'PRON',
       'PART', 'ADP', 'AUX', 'ADJ', 'SCONJ', 'ADV', 'SYM', 'NUM', 'CCONJ',
       'INTJ', 'SPACE']],y_test)

In [None]:
set(stats.columns) == set(test_stats.columns)

In [None]:
from sklearn.cla import Lo

In [None]:
rfc.score(stats,y_train), rfc.score(test_stats,y_test)

In [None]:
parts_of_speech_stats=pos_stats(X_train)

In [None]:
parts_of_speech_stats.join(ts)

In [None]:
ts =title_stats(X_train)

In [None]:
X_train_pos = pos_stats(X_train)

In [None]:
X_test_pos = pos_stats(X_test)

In [None]:
X_test_pos.shape

[source: strip characters from string in series](https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column)
[source: remove punctuation](https://www.google.com/search?q=how+to+replace+punctuation+with+regular+expression+python&rlz=1C5CHFA_enUS983US983&oq=how+to+replace+punctuation+with+regular&aqs=chrome.1.69i57j33i160l2.10574j0j7&sourceid=chrome&ie=UTF-8#kpvalbx=_7SabY4OANaSs0PEP042roAM_32)

Get rid of puctuation

[remove stop words](https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe)

In [None]:
stop_words = stopwords.words('english')
stop_words;

In [None]:
def filtered(data):
    # replace "-" w/ 'hyphen', then remove emojis, punctuation, digits, and urls
    # while keeping an indicator or url usage.
    data_filtered = data.map(lambda x: clean(
                            re.sub('-',repl=' hyphen ', string = x) , no_emoji=True,
                                no_punct=True,no_digits=True, no_urls=True))

    # remove leftovers from the 'clean' function  

    data_filtered = data_filtered.map(lambda x: re.sub('(0|\|)',
                                                    repl='',string = x))
    #replacing hyphen with '-'
    data_filtered = data_filtered.map(lambda x: re.sub('hyphen',repl='-', string = x))
    # remove stop words   
    data_filtered = data_filtered.map(lambda x: ' '.join([word for word 
                                              in x.split() if word not in 
                                              stop_words]))
    return data_filtered

In [None]:
X_test_filtered = filtered(X_test)

In [None]:
# replace "-" w/ 'hyphen', then remove emojis, punctuation, digits, and urls
# while keeping an indicator or url usage.

X_train_filtered = X_train.map(lambda x: clean(
                            re.sub('-',repl=' hyphen ', string = x) , no_emoji=True,
                                no_punct=True,no_digits=True, no_urls=True))

# remove leftovers from the 'clean' function  

X_train_filtered = X_train_filtered.map(lambda x: re.sub('(0|\|)',
                                                    repl='',string = x))
#replacing hyphen with '-'
X_train_filtered = X_train_filtered.map(lambda x: re.sub('hyphen',repl='-', string = x))
# remove stop words   
X_train_filtered = X_train_filtered.map(lambda x: ' '.join([word for word 
                                          in x.split() if word not in 
                                          stop_words]))

print((sum([len(s) for s in X_train])-sum([len(s) for s in X_train_filtered]))
    /sum([len(s) for s in X_train])
)

# I'll keep (<url>) group in the words, in case there is differential frequency

###### **Lemmatization** -- [sources](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

In [None]:
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('averaged_perceptron_tagger')

[Lemmatizing w/ POS](https://www.machinelearningplus.com/nlp/lemmatization-examples-python/)

In [None]:
# Lemmatize with POS Tag

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

To-do/idea: get POS counts [source](https://stackoverflow.com/questions/20960777/python-how-to-count-pos-tags-from-from-a-sentence)

In [None]:
lemmatizer = WordNetLemmatizer()

X_train_lemmatized = []
for title in X_train_filtered:
    X_train_lemmatized.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for \
                               w in nltk.word_tokenize(title)]))
X_train_lemmatized;

In [None]:
X_train_lemmatized = pd.Series(data = X_train_lemmatized, index = X_train_filtered.index)

In [None]:
X_test_lemmatized = []
for title in X_test_filtered:
    X_test_lemmatized.append(' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for \
                               w in nltk.word_tokenize(title)]))
X_test_lemmatized

In [None]:
X_test_lemmatized

In [None]:
# back to DataFrame
df = pd.DataFrame(X_train_lemmatized, columns = ['title']).join(pd.DataFrame(y_train))

In [None]:
df

In [None]:
df_test = pd.DataFrame(X_test_lemmatized, columns = ['title']).join(pd.DataFrame(y_train))

In [None]:
df.index = range(df.shape[0])

###### With words lemmatized and english stop words removed I will proceed with EDA

Below I will inspect various distributions of title statistics after adding the statistics to the dataframe.

In [None]:
df['title_length'] = [len(df.loc[i,'title']) for i in range(len(df['title']))]

In [None]:
df['title_word_count'] = [len(df.loc[i,'title'].split(' ')) for i in range(len(df['title']))]

In [None]:
df['max_word_length'] = [max(map(len, title.split(' '))) \
                         for title in df['title']] 

In [None]:
df['avg_word_length'] = df['title_length']/df['title_word_count']

In [None]:
df['subreddit_indicator']=[1 if sub == 'technology' else 0 for sub in df['subreddit']]

In [None]:
import seaborn as sns

###### Visualizations

In [None]:
sns.histplot(data = df, x = 'title_word_count', hue = 'subreddit', stat='density', common_norm = False, bins = 45).set(title = 'Distribution of title word counts by subreddit')

Above we can see that shorter titles have a better chance of having been posted to technology, while longer titles are more likely from science even after accounting for baseline frequencies.

Above we see that there are many posts between 0 and 20 words long with a 

In [None]:
sns.histplot(data = df, x = 'title_length', hue = 'subreddit', stat='density', common_norm = False).set(title = 'Distribution of title character length by subreddit')

Above we see that generally title character lengths under around 80 are more likely to have been from technology, and above 80 from science.

In [None]:
sns.histplot(data = df, x = 'avg_word_length', hue = 'subreddit', 
             stat='density', bins = 100, common_norm = False).set(
    title = 'Distribution of title word counts by subreddit', xlim = [4,11])

[adjusting bins](https://stackoverflow.com/questions/48990594/how-to-draw-distribution-plot-for-discrete-variables-in-seaborn)

In [None]:
sns.histplot(data = df, x = 'max_word_length', hue = 'subreddit', 
             stat='density', common_norm = False, bins=np.arange(0,21)).set(
    title = 'Distribution of title word counts by subreddit', xlim = [0,20], 
    xticks = range(0,21));

In [None]:
df.corr()

Above we see that word length, character count, and word counts all have some correlation with the particular subredit, though the correlation is not strong.  However, from the density plots and the clear separation in likelihoods it is apparent that valuable information would likely be picked up from a tree-based classification model.  It is promising that the title statistics gathered thus far are not fully correlated with each other as this means they can provide non-redundant information to the modeling process.

I'll next look at word count frequencies

In [None]:
cv = CountVectorizer()
vectors = cv.fit_transform(df[df['subreddit']=='technology']['title'])
vectors.A
wc_vec = pd.DataFrame(vectors.A, columns = cv.get_feature_names_out())
wc_vec.sum().sort_values(ascending = False)[0:15].plot(kind = 'bar')\
.set(title = '15 most common words word count -- technology')

In [None]:
cv = CountVectorizer()
vectors = cv.fit_transform(df[df['subreddit']=='science']['title'])
vectors.A
wc_vec = pd.DataFrame(vectors.A, columns = cv.get_feature_names_out())
wc_vec.sum().sort_values(ascending = False)[0:15].plot(kind = 'bar')\
.set(title = '15 most common words word count -- science')

Above we see that there is not much overlap within the 15 most common words.  This indicates that there is a good chance that these common words will help with distinguishing between the subreddits.  Of note is that proper nouns seem to feature heavily in technology and almost not at all in science.

##### **Modeling**

######

##### **Modeling on title statistics**

In [None]:
title_stats(X_train)

In [None]:
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1_000,
                             ngram_range=(1,2))
logreg = LogisticRegression(penalty='elasticnet', max_iter=10_000, \
                            random_state=1)

In [None]:
tfidf_lr_params = {"logreg__C": [.1,1,10],
                    "logreg__l1_ratio": [.1,.5,.9],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}

In [None]:

tfidf_lr_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('logreg', LogisticRegression(penalty='none', max_iter=10_000, random_state=33))
])

In [None]:
gs_tfidf_lr = GridSearchCV(tfidf_lr_pipe,tfidf_lr_params)

In [None]:
df.shape, y_train.shape

In [None]:
df.columns

In [None]:
 gs_tfidf_lr.fit(df['title'],y_train)

In [None]:
gs_tfidf_lr.score(df['title'],y_train),gs_tfidf_lr.score(df['title'],y_train)

In [None]:
gs_tfidf_lr.score(X_test_lemmatized,y_test)

In [None]:
tfidf_xgb_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('xgb', xgb.XGBClassifier())])
tfidf_xgb_params = {"xgb__eta": [.01,.03],
                    'xgb__booster': ['gbtree','gblinear'],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,tfidf_xgb_params)

In [None]:
y_train_xgb = [1 if sub == 'technology' else 0 for sub in y_train]
y_test_xgb = [1 if sub == 'technology' else 0 for sub in y_test]


In [None]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)

In [None]:
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [None]:
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,{'tfidf_vec__max_df': [0.9],
 'tfidf_vec__min_df': [0.001],
 'xgb__booster': ['gblinear'],
 'xgb__eta': [0.01]})

 

In [None]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [None]:
gs_tfidf_xgb.best_params_

param choices:
tfidf_xgb_params = {"xgb__eta": [.01,.03],
                    'xgb__booster': ['gbtree','gblinear'],
                   'tfidf_vec__max_df': [.9,.95,1.0],
                   'tfidf_vec__min_df': [.001,.003]}
score: 

(0.9143668831168831, 0.8892494929006085)

gs_tfidf_xgb.best_params_ :

{'tfidf_vec__max_df': 0.9,
 'tfidf_vec__min_df': 0.001,
 'xgb__booster': 'gblinear',
 'xgb__eta': 0.01}
 

So far xgboost has provided the best prediction on the test set.  I will try to narrow in on the best parameters.

In [None]:
tfidf_xgb_pipe = Pipeline([
    ('tfidf_vec', tfidf_vec),
    ('xgb', xgb.XGBClassifier(seed = 1))])
tfidf_xgb_params = {"xgb__eta": [.01,.015,.05],
                    'xgb__booster': ['gbtree','gblinear'],
                    # 'xgb__lambda': [.1,1,10],
                    # 'xgb__alpha': [0,.1,1],
                   'tfidf_vec__max_df': [.85,.9,.925],
                   'tfidf_vec__min_df': [.0005,.001,.0015],}
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,tfidf_xgb_params)

In [None]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)

In [None]:
gs_tfidf_xgb.best_params_

In [None]:
gs_tfidf_xgb.best_params_
{'tfidf_vec__max_df': 0.85,
 'tfidf_vec__min_df': 0.001,
 'xgb__alpha': 0,
 'xgb__booster': 'gbtree',
 'xgb__eta': 0.05,
 'xgb__lambda': 0.1}


In [None]:
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [None]:
# gs_tfidf_xgb.fit()

In [None]:
gs_tfidf_xgb = GridSearchCV(tfidf_xgb_pipe,{'tfidf_vec__max_df': [0.9],
 'tfidf_vec__min_df': [0.001],
 'xgb__booster': ['gblinear'],
 'xgb__eta': [0.005]})

In [None]:
# gs_tfidf_xgb.fit(X_train_lemmatized,y_train_xgb)
gs_tfidf_xgb.score(X_train_lemmatized,y_train_xgb),gs_tfidf_xgb.score(X_test_lemmatized,y_test_xgb)

In [None]:
df

In [None]:
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
xgb_pos = xgb.XGBClassifier(seed = 1)
xgb_pos.fit(X_train_pos.a,y_train)

In [None]:
X_train