In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from textblob import Word, TextBlob
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
fixed_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [3]:
train = fetch_20newsgroups()
test = fetch_20newsgroups(subset="test")
X_train, y_train = train['data'], train['target']
X_test, y_test = test['data'], test['target']

## Preprocessing and regex features:

In [4]:
clean_sentences = lambda text: ''.join(filter(lambda x: x.isalpha() or x in [' ', '\n', '.', ',', ':', ',', '!', '?'] or x.isdigit(), text))
get_sentences = lambda text: re.findall(r'[A-Z][^\.]*[\.|\?|\!][$|\s]', text)
get_weird_sentences = lambda text: re.findall(r'[\s|^|\.][A-Z][^\.]*[\.|\?|\!]', text)

In [5]:
def remove_digits(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [6]:
def split_specific(data, topics):
    split_txts = []
    for txt in data:
        split_txt = {}
        for topic in topics:
            
            if topic+':' not in txt:
                split_txt[topic] = ''
            else:
                s = txt.split(topic + ':')[1]
                next_topic = re.findall('[a-z\-A-Z]+:',s)
                if next_topic:
#                     s = s.split(re.findall('[a-z\-A-Z]+:',s)[0])[0]
                    s = s.split(re.findall('[a-z\-A-Z]+:|$',s)[0])[0]
                split_txt[topic] = s
                txt = txt.replace(s, ' ').replace(topic+':', ' ')         
        split_txt['the_rest'] = txt
        split_txts += [split_txt]
        
    return split_txts

In [7]:
letters_only = lambda txt: ''.join(filter(lambda c: c.isalpha(), txt)).lower()

In [8]:
uppercase_words = lambda text: len(re.findall('[A-Z]{2,}', text))

In [31]:
def prepare_regex(X):
    df = pd.DataFrame(X, columns=['text'])
    df['text'] = df['text'].apply(clean_sentences)
    df['question_marks_cnt'] = df['text'].str.count('\?')
    df['exclamation_marks_cnt'] = df['text'].str.count('\!')
    df['uppercase_words_count'] = df['text'].apply(uppercase_words)
    df_ = pd.DataFrame(split_specific(df['text'], ['From']))
    df_['From'] = df_['From'].apply(lambda f: ' '.join(f.strip().split(' ')[0].split('.')))
    df_['sentences'] = df_.the_rest.apply(get_sentences)
    df_['sentences_count'] = df_['sentences'].apply(len)
    df_.loc[df_.sentences_count == 0, 'the_rest'] = df_.loc[df_.sentences_count == 0, 'the_rest'].apply(get_weird_sentences)
    df_['sentences_count'] = df_['sentences'].apply(len)
    df_.loc[df_.sentences_count == 0, 'sentences'] = [['']]*df_.loc[df_.sentences_count == 0, 'sentences'].shape[0]
    df_['median_sentence_len'] = df_['sentences'].apply(lambda arr: np.median(np.array([len(item) for item in arr]), axis=0))
    df_['polarity_1_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[0]).sentiment.polarity if len(s) > 0 else np.nan)
    df_['subjectivity_1_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[0]).sentiment.subjectivity  if len(s) > 0 else np.nan)
    df_['polarity_last_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[-1]).sentiment.polarity  if len(s) > 0 else np.nan)
    df_['subjectivity_last_sntc'] = df_['sentences'].apply(lambda s: TextBlob(s[-1]).sentiment.subjectivity  if len(s) > 0 else np.nan)
    df_.drop(['sentences', 'the_rest'], axis=1, inplace=True)
    df = pd.concat([df, df_], axis=1)
    return df

In [13]:
df.head(2)

Unnamed: 0,text,question_marks_cnt,exclamation_marks_cnt,uppercase_words_count,From,sentences_count,median_sentence_len,polarity_1_sntc,subjectivity_1_sntc,polarity_last_sntc,subjectivity_last_sntc
0,From: lerxstwam.umd.edu wheres my thing\nSubje...,1,1,2,lerxstwam umd edu,8,49.0,-0.208333,0.333333,0.0,0.0
1,From: guykuocarson.u.washington.edu Guy Kuo\nS...,0,0,7,guykuocarson u washington edu,4,94.0,0.75,0.95,0.2,0.2


## TF-IDF Vectorizer

In [80]:
df = prepare_regex(X_train)

  return _methods._mean(a, axis=axis, dtype=dtype,


In [81]:
vectorizer = TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2),
                    stop_words='english',
                    vocabulary=None,
                    max_df=0.4, #0.8 
                    min_df = 30, #50
                    max_features=None,
                    smooth_idf=False,
                    norm='l2',
                    preprocessor=remove_digits)
bag = vectorizer.fit_transform(df['text'])
words = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [82]:
vectorizer = TfidfVectorizer(
                    analyzer='char_wb',
                    ngram_range=(4,9),
                    vocabulary=None,
                    max_df=400, #0.8 
                    min_df = 200, #50
                    max_features=None,
                    smooth_idf=False,
                    norm='l2',
                    preprocessor=letters_only)
bag = vectorizer.fit_transform(df['text'])
chars = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [83]:
final = pd.concat([words, chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)

In [73]:
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
best_final_ = pd.DataFrame(final_, columns=final.columns)

**Best model minus regex**

In [84]:
final = pd.concat([words, chars], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
best_final_ = pd.DataFrame(final_, columns=final.columns)

In [93]:
reg = LogisticRegression(penalty='l1',
                         C=0.75,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logreg_bot_scaled = cross_validate(reg, best_final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [92]:
pd.DataFrame(logreg_bot_scaled).mean(axis=0) #2

fit_time       6.489713
score_time     0.901423
test_score     0.770461
train_score    0.811605
dtype: float64

In [94]:
reg.fit(final_, y_train)
best_weights = list(zip(np.abs(reg.coef_).sum(axis=0), best_final_.columns))
best_weights.sort(key = lambda x: x[0], reverse=True)



In [109]:
selected_columns = [x[1] for x in best_weights[:3000]]

In [115]:
reg = LogisticRegression(penalty='l2',
                         C=0.0000005,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logreg_best = cross_validate(reg, best_final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ..................., score=(train=0.820, test=0.793) total time=   1.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV] END ..................., score=(train=0.830, test=0.793) total time=   1.9s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s


[CV] END ..................., score=(train=0.827, test=0.787) total time=   2.2s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.8s finished


In [116]:
reg.fit(best_final_[selected_columns], y_train)
y_preds = reg.predict(test_prep[selected_columns])
accuracy_score(y_preds, y_test)

0.7489378651088688

## best model


**Selecting features**

In [122]:
final = pd.concat([words, chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
best_final_ = pd.DataFrame(final_, columns=final.columns)

In [123]:
reg = LogisticRegression(penalty='l1',
                         C=0.1,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
reg.fit(final_, y_train)
best_weights = list(zip(np.abs(reg.coef_).sum(axis=0), best_final_.columns))
best_weights.sort(key = lambda x: x[0], reverse=True)



In [124]:
regex_cols = df.drop(['text', 'From'], axis=1).columns.tolist()

In [125]:
best_weights[:5]

[(4.125387278458404, 'sentences_count'),
 (1.4124423860737758, 'windows'),
 (1.2282296483211368, 'sale'),
 (1.1711097826738268, 'dod'),
 (0.9782444557775232, 'car')]

In [126]:
list(filter(lambda x: x[1] in regex_cols, best_weights))

[(4.125387278458404, 'sentences_count'),
 (0.093578618837925, 'uppercase_words_count'),
 (0.059680283522817706, 'question_marks_cnt'),
 (0.02386087730195255, 'subjectivity_1_sntc'),
 (0.0017738851442983998, 'polarity_last_sntc'),
 (0.0, 'exclamation_marks_cnt'),
 (0.0, 'median_sentence_len'),
 (0.0, 'polarity_1_sntc'),
 (0.0, 'subjectivity_last_sntc')]

All but one regex features are useless, but that one, 'sentences_count', is the most important for logreg

**Logistic Regression with 5000 most important features**

In [167]:
selected_columns = [x[1] for x in best_weights[:5000]]

In [168]:
reg = LogisticRegression(penalty='l2',
                         C=0.01,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logistic_reg_results_tfidf = cross_validate(reg, best_final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ..................., score=(train=1.000, test=0.931) total time=   9.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.934) total time=   9.2s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.5s remaining:    0.0s


[CV] END ..................., score=(train=1.000, test=0.926) total time=   9.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   28.3s finished


In [169]:
pd.DataFrame(logistic_reg_results_tfidf).mean(axis=0)

fit_time       9.294888
score_time     0.062602
test_score     0.930440
train_score    0.999779
dtype: float64

**Best model interpretation and test score**

In [170]:
reg = LogisticRegression(penalty='l2',
                         C=0.01,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
reg.fit(best_final_[selected_columns], y_train)

In [None]:
df = prepare_regex(X_test)

In [139]:
words_vocabulary = list(words.columns)
chars_vocabulary = list(chars.columns)

In [140]:
vectorizer = TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2),
                    stop_words='english',
                    vocabulary=words_vocabulary,
                    max_df=0.4, #0.8 
                    min_df = 30, #50
                    max_features=None,
                    smooth_idf=True,
                    norm='l2',
                    preprocessor=remove_digits)
bag = vectorizer.fit_transform(df['text'])
words = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [141]:
vectorizer = TfidfVectorizer(
                    analyzer='char_wb',
                    ngram_range=(4,9),
                    vocabulary=chars_vocabulary,
                    max_df=400, #0.8 
                    min_df = 200, #50
                    max_features=None,
                    smooth_idf=True,
                    norm='l2',
                    preprocessor=letters_only)
bag = vectorizer.fit_transform(df['text'])
chars = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [142]:
final = pd.concat([words, chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)

In [143]:
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
test_prep = pd.DataFrame(final_, columns=final.columns)

In [171]:
y_pred = reg.predict(test_prep[selected_columns])

In [172]:
accuracy_score(y_test, y_pred)

0.7955390334572491

3000 features: 0.78

In [173]:
import eli5

In [180]:
eli5.show_weights(reg, feature_names = selected_columns, target_names=train['target_names'])

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+0.107,atheis,,,,,,,,,,,,,,,,,,
+0.100,atheism,,,,,,,,,,,,,,,,,,
+0.096,keithcco,,,,,,,,,,,,,,,,,,
+0.089,subject thoughts,,,,,,,,,,,,,,,,,,
+0.089,heist,,,,,,,,,,,,,,,,,,
+0.086,axes,,,,,,,,,,,,,,,,,,
+0.083,charley wingate,,,,,,,,,,,,,,,,,,
+0.081,edu keith,,,,,,,,,,,,,,,,,,
+0.080,schneider,,,,,,,,,,,,,,,,,,
+0.080,osrhe,,,,,,,,,,,,,,,,,,

Weight?,Feature
+0.107,atheis
+0.100,atheism
+0.096,keithcco
+0.089,subject thoughts
+0.089,heist
+0.086,axes
+0.083,charley wingate
+0.081,edu keith
+0.080,schneider
+0.080,osrhe

Weight?,Feature
+0.199,graphics
+0.118,images
+0.117,image
+0.114,<BIAS>
+0.102,philosophical
+0.101,tiff
+0.090,files
+0.088,ytra
+0.086,xv
+0.086,acta

Weight?,Feature
+0.239,windows
+0.135,cica
+0.114,rwindows
+0.106,bj
+0.091,file
+0.090,win
+0.086,orwin
+0.085,forwi
+0.085,driver
+0.082,subject windows

Weight?,Feature
+0.143,<BIAS>
+0.112,dx
+0.105,cdrom
+0.094,nitor
+0.089,gateway
+0.083,os
+0.081,vlb
+0.081,disk
+0.080,bios
+0.078,pc

Weight?,Feature
+0.202,mac
+0.159,apple
+0.144,macs
+0.140,<BIAS>
+0.137,duo
+0.137,centris
+0.129,lc
+0.121,powerbook
+0.111,quadra
+0.105,lciii

Weight?,Feature
+0.155,thex
+0.138,server
+0.138,xr
+0.131,widget
+0.128,window
+0.119,xter
+0.114,motif
+0.112,otif
+0.090,widgets
+0.081,application

Weight?,Feature
+0.222,sale
+0.101,forsale
+0.099,aleo
+0.093,sell
+0.088,brand new
+0.087,offer
+0.080,subject wanted
+0.079,wanted
+0.078,askin
+0.078,efors

Weight?,Feature
+0.243,car
+0.178,<BIAS>
+0.107,warning read
+0.103,subject warning
+0.098,thecar
+0.097,engine
+0.094,toyota
+0.093,auto
+0.090,autom
+0.078,trunk

Weight?,Feature
+0.271,dod
+0.162,bikes
+0.143,<BIAS>
+0.128,motorc
+0.126,bmw
+0.119,ride
+0.106,cycl
+0.106,riding
+0.105,yamaha
+0.097,motorcycle

Weight?,Feature
+0.111,cubs
+0.106,pitc
+0.098,eball
+0.098,basebal
+0.098,baseba
+0.097,aseball
+0.094,phillies
+0.087,tigers
+0.083,giants
+0.076,braves

Weight?,Feature
+0.147,nhl
+0.129,hocke
+0.126,ockey
+0.121,hock
+0.090,game
+0.089,team
+0.087,detroit
+0.086,playo
+0.083,espn
+0.082,seaso

Weight?,Feature
+0.110,pgp
+0.104,key
+0.091,rypto
+0.091,ypto
+0.090,ncry
+0.090,encry
+0.086,ncryp
+0.086,encryp
+0.086,ncrypt
+0.084,wiretap

Weight?,Feature
+0.203,<BIAS>
+0.112,circuit
+0.093,circu
+0.091,scop
+0.085,tv
+0.080,electronics
+0.080,power
+0.077,voltage
+0.075,ee
+0.075,otoro

Weight?,Feature
+0.163,<BIAS>
+0.155,octor
+0.113,medic
+0.100,treatment
+0.100,disease
+0.097,msg
+0.096,foods
+0.095,subject msg
+0.086,diet
+0.081,cancer

Weight?,Feature
+0.198,space
+0.113,espace
+0.106,rbit
+0.104,orbi
+0.100,moon
+0.096,dcx
+0.096,lanet
+0.095,subject space
+0.093,launc
+0.090,sky

Weight?,Feature
+0.169,clh
+0.101,athos
+0.101,athos rutgers
+0.099,geneva rutgers
+0.090,christians
+0.084,churc
+0.082,hell organization
+0.082,nchrist
+0.081,hurch
+0.078,god

Weight?,Feature
+0.137,gun
+0.107,waco
+0.101,subject gun
+0.098,cdtsw stratus
+0.098,firea
+0.098,egun
+0.091,rearm
+0.089,politics guns
+0.087,handgun
+0.084,batffbi

Weight?,Feature
+0.120,srael
+0.120,srae
+0.120,israe
+0.119,rael
+0.109,subject israeli
+0.093,hezbollah
+0.088,serdar
+0.081,turk
+0.079,meni
+0.079,lesti

Weight?,Feature
+0.102,topt
+0.095,drugs
+0.094,linton
+0.090,aldis
+0.084,sexu
+0.078,com clayton
+0.074,arecom
+0.072,clayton
+0.072,verdict
+0.070,tax

Weight?,Feature
+0.104,know happened
+0.089,edu tony
+0.086,beast
+0.086,weiss
+0.083,subject years
+0.082,say christian
+0.081,biblical
+0.080,age
+0.079,ndig
+0.077,dece


In [190]:
best_final_.loc[best_final_['pittedu'] > 0].head(3)

Unnamed: 0,aa,aaa,aaron,ab,abandoned,abc,ability,able,able use,abortion,...,zine,question_marks_cnt,exclamation_marks_cnt,uppercase_words_count,sentences_count,median_sentence_len,polarity_1_sntc,subjectivity_1_sntc,polarity_last_sntc,subjectivity_last_sntc
53,-0.0613,-0.051056,-0.071476,-0.078557,-0.041258,-0.050063,-0.114927,-0.239743,-0.050097,-0.052426,...,-0.120898,-0.093352,-0.086395,-0.092054,-0.03781,-0.461526,0.123626,-1.296358,-0.211684,0.698994
177,-0.0613,-0.051056,-0.071476,-0.078557,-0.041258,-0.050063,-0.114927,-0.239743,-0.050097,-0.052426,...,-0.120898,-0.049794,-0.086395,-0.080659,-0.221437,-0.11056,-0.720481,0.160459,0.139711,-0.43588
251,-0.0613,-0.051056,-0.071476,-0.078557,-0.041258,-0.050063,-0.114927,-0.239743,-0.050097,-0.052426,...,-0.120898,-0.093352,-0.086395,-0.074961,0.145817,-0.126884,-0.298427,0.524664,0.420827,0.772212


The names of the classes not once are the most weighted words in a bag, so maybe it would be a good idea to make a feature out of a class name. The model also values 'edu keith' in a class named 'atheism'. That probably indicates that trainset has a lot of emails from 'edu keith' featuring that theme. It's bad for generalization, because Keith is a person, so maybe it would be a good idea to extract email adresses from the set. Also, 'caltech edu' refers to 'keith edu' as that is one man's email. The reason I left the emails was to spot organizations that are most active in a particular field. Like 'pitt' which is a domen for Univercity of Pittsburg Computer Science, but that adress is valued high in 'medical' class, which doesn't make sense logically. In other cases, it's pretty self-explanatory and seems reasonable: class 'baseball' values 'sox' and 'stadium', 'guns' - 'cnn' and 'fbi', 'mideast' - 'israel'.

In [193]:
X_train[177][:200]

'From: geb@cs.pitt.edu (Gordon Banks)\nSubject: Re: tuberculosis\nReply-To: geb@cs.pitt.edu (Gordon Banks)\nOrganization: Univ. of Pittsburgh Computer Science\nLines: 20\n\nIn article <1993Mar25.020646.852@n'

In [195]:
train['target_names'][y_train[177]]

'sci.med'

**Logistic Regression, not scaled**

In [33]:
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
# final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [37]:
reg = LogisticRegression(penalty='l2',
                         C=0.01,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
logreg_bot_scaled = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ..................., score=(train=0.174, test=0.169) total time=  15.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.1s remaining:    0.0s


[CV] END ..................., score=(train=0.151, test=0.152) total time=  17.4s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   33.8s remaining:    0.0s


[CV] END ..................., score=(train=0.158, test=0.160) total time=  17.8s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   51.8s finished


In [38]:
pd.DataFrame(logreg_bot_scaled).mean(axis=0)

fit_time       17.093319
score_time      0.056109
test_score      0.160243
train_score     0.160952
dtype: float64

**Logistic Regression, chars_wb + regex**

In [35]:
final = pd.concat([chars, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [40]:
reg = LogisticRegression(penalty='l1',
                         C=0.01,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logistic_reg_results = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [39]:
pd.DataFrame(logistic_reg_results).mean(axis=0)

fit_time       145.164439
score_time       0.361391
test_score       0.704613
train_score      0.757955
dtype: float64

In [41]:
reg.fit(final_, y_train)
weights = list(zip(np.abs(reg.coef_).sum(axis=0), final_.columns))
weights.sort(key = lambda x: x[0], reverse=True)



5000 feautres overfitted the model:

In [50]:
pd.DataFrame(chars_tfidf).mean(axis=0)

fit_time       13.519301
score_time      0.069105
test_score      0.836132
train_score     0.999735
dtype: float64

In [64]:
selected_columns = [x[1] for x in weights[:2000]]

In [67]:
reg = LogisticRegression(penalty='l2',
                         C=0.00005,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
chars_tfidf = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [66]:
pd.DataFrame(chars_tfidf).mean(axis=0)

fit_time       2.680824
score_time     0.020952
test_score     0.796005
train_score    0.853191
dtype: float64

**Logistic Regression, words + regex**

In [17]:
final = pd.concat([words, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [23]:
reg = LogisticRegression(penalty='l1',
                         C=0.01,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logistic_reg_results = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [24]:
pd.DataFrame(logistic_reg_results).mean(axis=0)

fit_time       51.055507
score_time      0.109645
test_score      0.790613
train_score     0.840993
dtype: float64

In [25]:
reg.fit(final_, y_train)
weights = list(zip(np.abs(reg.coef_).sum(axis=0), final_.columns))
weights.sort(key = lambda x: x[0], reverse=True)

In [32]:
selected_columns = [x[1] for x in weights[:2000]]

In [38]:
reg = LogisticRegression(penalty='l2',
                         C=0.0000005,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
words_tfidf = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [40]:
pd.DataFrame(words_tfidf).mean(axis=0)

fit_time       1.550950
score_time     0.020808
test_score     0.833657
train_score    0.889296
dtype: float64

**Logistic Regression, chars + regex**

In [41]:
vectorizer = TfidfVectorizer(
                    analyzer='char',
                    ngram_range=(4,9),
                    vocabulary=None,
                    max_df=400, #0.8 
                    min_df = 200, #50
                    max_features=None,
                    smooth_idf=False,
                    norm='l2',
                    preprocessor=letters_only)
bag = vectorizer.fit_transform(df['text'])
_chars_ = pd.DataFrame(bag.astype(np.float32).todense(), columns = vectorizer.get_feature_names_out())

In [43]:
_chars_.head(1)

Unnamed: 0,abad,abas,abel,abet,abett,abette,abetter,abig,abilityt,ablean,...,zationin,zationl,zationne,zationo,zationr,zationst,zationw,zens,zethe,zine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
final = pd.concat([_chars_, df.drop(['text', 'From'], axis=1)], axis=1)
final.drop(final.columns[final.columns.duplicated()], axis=1, inplace=True)
imputer = SimpleImputer(strategy='mean')
final_ = imputer.fit_transform(final)
final_ = StandardScaler().fit_transform(final_)
final_ = pd.DataFrame(final_, columns=final.columns)

In [47]:
reg = LogisticRegression(penalty='l1',
                         C=0.01,
                         multi_class='ovr', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42,
                         solver='liblinear')
logistic_reg_results = cross_validate(reg, final_, y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [48]:
pd.DataFrame(logistic_reg_results).mean(axis=0)

fit_time       144.440739
score_time       0.264891
test_score       0.704613
train_score      0.757822
dtype: float64

In [49]:
reg.fit(final_, y_train)
weights = list(zip(np.abs(reg.coef_).sum(axis=0), final_.columns))
weights.sort(key = lambda x: x[0], reverse=True)



In [63]:
selected_columns = [x[1] for x in weights[:2000]]

In [67]:
reg = LogisticRegression(penalty='l2',
                         C=0.00007,
                         multi_class='multinomial', #multinomial, ovr
                         class_weight='balanced',
                         n_jobs=4,
                         random_state=42)
_chars_tfidf = cross_validate(reg, final_[selected_columns], y_train, cv=fixed_skf, scoring='accuracy',verbose=3, return_train_score=True)

In [66]:
pd.DataFrame(_chars_tfidf).mean(axis=0)

fit_time       2.513647
score_time     0.025955
test_score     0.801396
train_score    0.861145
dtype: float64