In [135]:
import requests
import time
import pandas as pd
import numpy as np
import json
import seaborn as sns
import random
import matplotlib.pyplot as plt

from os import path
from PIL import Image
import os

from wordcloud import WordCloud, STOPWORDS


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


import warnings
warnings.simplefilter(action='ignore')

In [413]:
redditposts_df = pd.read_csv('./dataset/cleansed_combined_angermed_posts.csv')
redditposts_df.head(10)

Unnamed: 0,posts,target
0,If you're angry *about* something or something...,1
1,thistextisfiller,1
2,I've nearly gotten myself in bad trouble more ...,1
3,I had an uncomfortable moment with some people...,1
4,Hi all. I'm a very mellow person who doesn't ...,1
5,I'm done. I can't cope. I'm so angry and I can...,1
6,I have been feeling on edge lately and a few d...,1
7,Growing up I didn't have much of really any su...,1
8,I'd been dating this guy and things ended last...,1
9,A problem ive been having again recently is d...,1


#### Do train-test split

In [414]:
features = ['posts']
X=redditposts_df[features]
y=redditposts_df['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)
X_train

Unnamed: 0,posts
921,Yesterday was not any different than any other...
2934,I fell out of practice; first time meditating ...
3343,Alan Watts | It's Just A Show
606,"I hate doing my bills. Some, like my car paym..."
1618,Fuse blew on 3
3434,Is Headspace really that great? The courses ar...
965,I struggle with explosive anger. I've spent ye...
2952,Gap between losing it and finding it again
1601,How to help my boyfriend with his anger?
3246,Why is it so difficult for me to meditate lyin...


In [562]:
X_TEST_List = ['Anger damages your brain. Must control anger',
               'Meditation made me feel happy. I am relaxed',
              'More anxiety means more harm to body. Must relax mind with aroma and sound',
              'Got into fight after food discussion. Mood was upset']
X_TEST_TO_PREDICT = pd.DataFrame(X_TEST_List,columns=['posts'])
X_TEST_TO_PREDICT

Unnamed: 0,posts
0,Anger damages your brain. Must control anger
1,Meditation made me feel happy. I am relaxed
2,More anxiety means more harm to body. Must rel...
3,Got into fight after food discussion. Mood was...


#### Steps of function  - remove html code artefacts, remove non-letters, convert into lowercase,  remove stop words

In [563]:
def posts_to_words(raw_posts):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML.
    posts_text = BeautifulSoup(raw_posts).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", posts_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

#### Apply function

In [564]:
# Get the number of reviews based on the dataframe size.
total_posts = X_train.shape[0]
print(f'There are {total_posts} posts.')

# Initialize an empty list to hold the clean reviews.
clean_train_posts = []
clean_test_posts = []

clean_XTEST_posts = []

There are 2847 posts.


In [565]:
print("Cleaning and parsing the training set posts...")

j = 0
for train_posts in X_train['posts']:
    # Convert review to words, then append to clean_train_reviews.
    clean_train_posts.append(review_to_words(train_posts))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Posts {j + 1} of {total_posts}.')
    
    j += 1

# Let's do the same for our testing set.

print("Cleaning and parsing the testing set posts...")

for test_posts in X_test['posts']:
    # Convert review to words, then append to clean_train_reviews.
    clean_test_posts.append(posts_to_words(test_posts))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Posts {j + 1} of {total_posts}.')
        
    j += 1

Cleaning and parsing the training set posts...
Posts 1000 of 2847.
Posts 2000 of 2847.
Cleaning and parsing the testing set posts...
Posts 3000 of 2847.


In [566]:
for X_TEST_posts in X_TEST_TO_PREDICT['posts']:
    # Convert review to words, then append to clean_train_reviews.
    clean_XTEST_posts.append(posts_to_words(X_TEST_posts))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Posts {j + 1} of {X_TEST_posts}.')
        
    j += 1

#### Use CountVectorizer

In [567]:
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000) 

In [568]:
train_data_features = vectorizer.fit_transform(clean_train_posts)

test_data_features = vectorizer.transform(clean_test_posts)

# Numpy arrays are easy to work with, so convert the result to an 
# array.
train_data_features = train_data_features.toarray()

In [569]:
X_TEST_data_features = vectorizer.transform(clean_XTEST_posts)

In [570]:
print(train_data_features.shape)

(2847, 5000)


In [571]:
print(test_data_features.shape)

(949, 5000)


In [572]:
vocab = vectorizer.get_feature_names()
print(vocab)



In [573]:
lr = LogisticRegression()

In [574]:
lr.fit(train_data_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [575]:
lr.score(train_data_features, y_train)

0.9848963821566561

In [576]:
lr.score(test_data_features, y_test)

0.9178082191780822

In [577]:
predictions=lr.predict(test_data_features)

In [578]:
print(confusion_matrix(y_test, predictions))

[[425  35]
 [ 43 446]]


In [579]:
type(test_data_features)

scipy.sparse.csr.csr_matrix

In [580]:
type(X_TEST_data_features)

scipy.sparse.csr.csr_matrix

In [581]:
print(lr.predict(X_TEST_data_features.toarray()))

[1 0 0 1]


In [501]:
clean_XTEST_posts

['meditation helps combating anxiety releaseing pressure']

In [502]:
X_TEST_data_features.data

array([1, 1, 1, 1])

In [None]:
 wc = WordCloud(max_words= 100,
                          width = 744, 
                          height = 544,
                          background_color ='white',
                          stopwords=stopwords, 
                          contour_width=3, 
                          contour_color='steelblue',
                          min_font_size = 10).generate(text) 

In [589]:
def l_Regression(model_num):
    if model_num == 'l1':
        pipe = Pipeline([
            ('lr',LogisticRegression())
        ])

        pipe_params = {
            'lr__penalty': [model_num] #lasso
#             'lr__C': [1, 1.5, 2, 2.5],
#             'lr__class_weight': ['balanced'],
#             'lr__warm_start': [True, False],
#             'lr__random_state': [42],
#             'lr__solver': ['liblinear']
        }
    elif model_num == 'l2':
        pipe = Pipeline([
            ('lr',LogisticRegression())
        ])

        pipe_params = {
            'lr__penalty': ['l2'] #ridge
#             'lr__C': [1, 1.5, 2, 2.5],
#             'lr__class_weight': ['balanced'],
#             'lr__warm_start': [True, False],
#             'lr__random_state': [42],
#             'lr__solver': ['lbfgs', 'liblinear']
        }
    else:
        print('Error due to invalid model number passed. Please check')
        return
    
    gsv = GridSearchCV(pipe,param_grid=pipe_params,cv=5)
    gsv.fit(train_data_features,y_train)
    print('training score {0}: '.format(gsv.score(train_data_features,y_train)))
    print('test score {0}: '.format(gsv.score(test_data_features,y_test)))
    
    predictions = gsv.predict(test_data_features)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {gsv.best_params_}')
    print(lr.predict(X_TEST_data_features.toarray()))
    reddit_category = ['Meditation','Anger'] # As set 1 - 'Anger' , 0 - 'Meditation'
    X_TEST_PREDICT = gsv.predict(X_TEST_data_features.toarray())
    X_TEST_PREDICT_RESULT = [reddit_category[predict] for predict in X_TEST_PREDICT]
    print('Given list for prediction: {0}'.format(X_TEST_List))
    print('predictions for Manual Test: {0}'.format(X_TEST_PREDICT_RESULT))

In [590]:
l_Regression('l1')

training score 0.9490691956445381: 
test score 0.8925184404636459: 
--------
[[428  32]
 [ 70 419]]
Best params = {'lr__penalty': 'l1'}
[1 0 0 1]
Given list for prediction: ['Anger damages your brain. Must control anger', 'Meditation made me feel happy. I am relaxed', 'More anxiety means more harm to body. Must relax mind with aroma and sound', 'Got into fight after food discussion. Mood was upset']
predictions for Manual Test: ['Anger', 'Meditation', 'Meditation', 'Anger']


In [591]:
l_Regression('l2')

training score 0.9848963821566561: 
test score 0.9178082191780822: 
--------
[[425  35]
 [ 43 446]]
Best params = {'lr__penalty': 'l2'}
[1 0 0 1]
Given list for prediction: ['Anger damages your brain. Must control anger', 'Meditation made me feel happy. I am relaxed', 'More anxiety means more harm to body. Must relax mind with aroma and sound', 'Got into fight after food discussion. Mood was upset']
predictions for Manual Test: ['Anger', 'Meditation', 'Meditation', 'Anger']


In [361]:
redditposts_df['posts'] = redditposts_df.posts.map(lambda x: x.lower())
redditposts_df

Unnamed: 0,posts,target
0,if you're angry *about* something or something...,1
1,thistextisfiller,1
2,i've nearly gotten myself in bad trouble more ...,1
3,i had an uncomfortable moment with some people...,1
4,hi all. i'm a very mellow person who doesn't ...,1
5,i'm done. i can't cope. i'm so angry and i can...,1
6,i have been feeling on edge lately and a few d...,1
7,growing up i didn't have much of really any su...,1
8,i'd been dating this guy and things ended last...,1
9,a problem ive been having again recently is d...,1


#### clean the post text

In [349]:
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML.
    review_text = BeautifulSoup(raw_review).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

In [350]:
redditposts_df.posts = redditposts_df.posts.map(lambda x: review_to_words(x))
redditposts_df

Unnamed: 0,posts,target
0,angry something something really bugs want ran...,1
1,thistextisfiller,1
2,nearly gotten bad trouble road rage one techni...,1
3,uncomfortable moment people situation felt lou...,1
4,hi mellow person really anger problems essenti...,1
5,done cope angry cope done,1
6,feeling edge lately days ago chose activity kn...,1
7,growing much really support parents dealt lot ...,1
8,dating guy things ended last night things weir...,1
9,problem ive recently dying losing shooting gam...,1


In [287]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [288]:
stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

#### Define features and to train-test-split

In [390]:
features = ['posts']
X=redditposts_df[features]
y=redditposts_df['target']

In [391]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,stratify=y)

In [392]:
X_train

Unnamed: 0,posts
921,Yesterday was not any different than any other...
2934,I fell out of practice; first time meditating ...
3343,Alan Watts | It's Just A Show
606,"I hate doing my bills. Some, like my car paym..."
1618,Fuse blew on 3
3434,Is Headspace really that great? The courses ar...
965,I struggle with explosive anger. I've spent ye...
2952,Gap between losing it and finding it again
1601,How to help my boyfriend with his anger?
3246,Why is it so difficult for me to meditate lyin...


In [323]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, 
                             min_df=0.02,
                             max_df=0.9,
                             ngram_range=(1,1))

In [324]:
X_train_vect_posts = bow_vector.fit_transform(X_train.posts)

X_test_vect_posts = bow_vector.transform(X_test.posts)

In [325]:
bow_vector.get_feature_names()

['..',
 '...',
 '1',
 '10',
 '2',
 '3',
 '4',
 '5',
 'able',
 'actually',
 'advice',
 'ago',
 'amp;#x200b',
 'anger',
 'angry',
 'anxiety',
 'anymore',
 'asked',
 'aware',
 'away',
 'bad',
 'bed',
 'believe',
 'best',
 'better',
 'big',
 'bit',
 'body',
 'brain',
 'break',
 'breath',
 'breathing',
 'broke',
 'called',
 'calm',
 'came',
 'care',
 'cause',
 'change',
 'close',
 'come',
 'comes',
 'coming',
 'completely',
 'control',
 'couple',
 'dad',
 'daily',
 'day',
 'days',
 'deal',
 'decided',
 'deep',
 'depression',
 'different',
 'emotions',
 'end',
 'especially',
 'etc',
 'example',
 'experience',
 'extremely',
 'eyes',
 'face',
 'family',
 'far',
 'feel',
 'feeling',
 'feelings',
 'feels',
 'felt',
 'fight',
 'finally',
 'find',
 'focus',
 'found',
 'friend',
 'friends',
 'fuck',
 'fucking',
 'games',
 'gets',
 'getting',
 'goes',
 'going',
 'gone',
 'good',
 'got',
 'gotten',
 'great',
 'guy',
 'guys',
 'hand',
 'happen',
 'happened',
 'happens',
 'happy',
 'hard',
 'hate',
 'h

In [303]:
len(bow_vector.get_feature_names())

273

In [279]:
X_train_posts_df = pd.DataFrame(X_train_vect_posts.todense(),
                               columns = [feature + '_posts' for feature in bow_vector.get_feature_names()])
X_train_posts_df

Unnamed: 0,..._posts,10_posts,2_posts,3_posts,able_posts,actually_posts,advice_posts,ago_posts,anger_posts,angry_posts,...,went_posts,work_posts,working_posts,world_posts,worse_posts,wrong_posts,year_posts,years_posts,“_posts,”_posts
0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,2,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,5,0,...,0,1,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [280]:
X_test_posts_df = pd.DataFrame(X_test_vect_posts.todense(),
                               columns = [feature + '_posts' for feature in bow_vector.get_feature_names()])
X_test_posts_df

Unnamed: 0,..._posts,10_posts,2_posts,3_posts,able_posts,actually_posts,advice_posts,ago_posts,anger_posts,angry_posts,...,went_posts,work_posts,working_posts,world_posts,worse_posts,wrong_posts,year_posts,years_posts,“_posts,”_posts
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,2,0,0,0,1,0,1,0,4,0,...,2,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [211]:
redditposts_df

Unnamed: 0,posts,target
0,If you're angry *about* something or something...,1
1,thistextisfiller,1
2,I've nearly gotten myself in bad trouble more ...,1
3,I had an uncomfortable moment with some people...,1
4,Hi all. I'm a very mellow person who doesn't ...,1
5,I'm done. I can't cope. I'm so angry and I can...,1
6,I have been feeling on edge lately and a few d...,1
7,Growing up I didn't have much of really any su...,1
8,I'd been dating this guy and things ended last...,1
9,A problem ive been having again recently is d...,1


### Apply CountVectorizer on title and text separately
This is to ensure that words of title an dtext are determined separately and tokens are not missed

In [406]:
# dictionary_filepath = 'my_unigram_dictionary'
# vocabulary_to_load = pickle.load(open(dictionary_filepath, 'r'))
# cvec_posts = CountVectorizer(analyzer = "word",
#                              tokenizer = None,
#                              preprocessor = None,
#                              stop_words = None,
#                              #max_features = 5000, 
#                              min_df=0.01,
#                              max_df=0.99,
#                              ngram_range=(1,3)
#                             )
cvec_posts = CountVectorizer(stop_words='english',
                            analyzer='word',
                             min_df=.03,max_df=.9,ngram_range=(1,3))
#cvec_title = CountVectorizer(stop_words='english',strip_accents='ascii',min_df=.03,max_df=.9,ngram_range=(1,3))

In [407]:
X_train_posts_cvec = cvec_posts.fit_transform(X_train.posts)
#X_train_title_cvec = cvec_title.fit_transform(X_train.title)

X_test_posts_cvec = cvec_posts.transform(X_test.posts)
#X_test_title_cvec = cvec_title.transform(X_test.title)

In [408]:
#cvec_posts._validate_vocabulary()
cvec_posts.get_feature_names()

['10',
 'able',
 'actually',
 'advice',
 'ago',
 'amp',
 'anger',
 'anger issues',
 'angry',
 'anxiety',
 'anymore',
 'away',
 'bad',
 'best',
 'better',
 'body',
 'breath',
 'breathing',
 'calm',
 'come',
 'control',
 'day',
 'days',
 'deal',
 'did',
 'didn',
 'different',
 'does',
 'doesn',
 'doing',
 'don',
 'don know',
 'don want',
 'emotions',
 'end',
 'experience',
 'family',
 'feel',
 'feel like',
 'feeling',
 'feelings',
 'feels',
 'felt',
 'focus',
 'friends',
 'fucking',
 'getting',
 'going',
 'good',
 'got',
 'great',
 'guys',
 'happened',
 'hard',
 'hate',
 'having',
 'head',
 'help',
 'hit',
 'home',
 'https',
 'hurt',
 'im',
 'isn',
 'issues',
 'just',
 'kind',
 'know',
 'let',
 'life',
 'like',
 'little',
 'live',
 'll',
 'long',
 'look',
 'looking',
 'lost',
 'lot',
 'love',
 'mad',
 'make',
 'makes',
 'making',
 'maybe',
 'meditate',
 'meditating',
 'meditation',
 'mind',
 'minutes',
 'moment',
 'months',
 'need',
 'negative',
 'new',
 'parents',
 'past',
 'people',
 '

In [409]:
len(cvec_posts.get_feature_names())

160

In [25]:
#cvec_title.get_feature_names()

['advice',
 'career',
 'company',
 'help',
 'interview',
 'job',
 'jobs',
 'need',
 'new',
 'offer',
 'want',
 'work']

#### X_train_text_cvec is a sparse matrix

In [227]:
X_train_posts_cvec.data

array([1, 1, 1, ..., 1, 1, 1])

#### Create a dataframe from this sparse matrix with feature names and suffix as text to view it better

In [397]:
X_train_posts_df = pd.DataFrame(X_train_posts_cvec.todense(),
                               columns = [feature + '_posts' for feature in cvec_posts.get_feature_names()])
X_train_posts_df

Unnamed: 0,10_posts,able_posts,able to_posts,about_posts,about it_posts,actually_posts,advice_posts,after_posts,again_posts,ago_posts,...,world_posts,worse_posts,would_posts,would be_posts,wrong_posts,year_posts,years_posts,you_posts,you can_posts,your_posts
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,1,2,0,1
7,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [398]:
X_train_posts_df.shape

(2847, 405)

#### X_train_title_cvec is a sparse matrix

In [32]:
#X_train_title_cvec.data

array([1, 1, 1, ..., 1, 1, 1])

#### Create a dataframe from this sparse matrix with feature names and suffix as text to view it better

In [33]:
# X_train_title_df = pd.DataFrame(X_train_title_cvec.todense(),
#                                columns = [feature + '_title' for feature in cvec_title.get_feature_names()])
# X_train_title_df

Unnamed: 0,advice_title,career_title,company_title,help_title,interview_title,job_title,jobs_title,need_title,new_title,offer_title,want_title,work_title
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0
6,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
# X_train_title_df.shape

(1355, 12)

#### Similarly Create a dataframe of test from sparse matrix with feature names and suffix as text to view it better

In [399]:
X_test_posts_df = pd.DataFrame(X_test_posts_cvec.todense(),
                               columns = [feature + '_posts' for feature in cvec_posts.get_feature_names()])
X_test_posts_df

Unnamed: 0,10_posts,able_posts,able to_posts,about_posts,about it_posts,actually_posts,advice_posts,after_posts,again_posts,ago_posts,...,world_posts,worse_posts,would_posts,would be_posts,wrong_posts,year_posts,years_posts,you_posts,you can_posts,your_posts
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,2,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,1,0,0,1,0,2,0,...,0,0,0,0,0,0,0,7,1,0
9,0,0,0,2,1,0,0,1,0,0,...,0,0,2,2,0,0,0,1,0,0


In [400]:
X_test_posts_df.shape

(949, 405)

In [38]:
# X_test_title_df = pd.DataFrame(X_test_title_cvec.todense(),
#                                columns = [feature + '_title' for feature in cvec_title.get_feature_names()])
# X_test_title_df

Unnamed: 0,advice_title,career_title,company_title,help_title,interview_title,job_title,jobs_title,need_title,new_title,offer_title,want_title,work_title
0,0,0,1,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0


In [40]:
# X_test_title_df.shape

(452, 12)

#### Concat train and test - text and title after vectorization

In [401]:
#reddit_sparse_train_df = pd.concat([X_train_text_df,X_train_title_df],axis=1)
reddit_sparse_train_df = X_train_posts_df

In [402]:
# reddit_sparse_test_df = pd.concat([X_test_text_df,X_test_title_df],axis=1)
reddit_sparse_test_df = X_test_posts_df

#### Shape of final dataframe 

In [234]:
reddit_sparse_train_df.shape

(2847, 693)

In [235]:
reddit_sparse_test_df.shape

(949, 693)

### Using pipeline, GridSearhCV,LogisticRegression

In [403]:
def l_Regression(model_num):
    if model_num == 'l1':
        pipe = Pipeline([
            ('lr',LogisticRegression())
        ])

        pipe_params = {
            'lr__penalty': [model_num], #lasso
            'lr__C': [1, 1.5, 2, 2.5],
            'lr__class_weight': ['balanced'],
            'lr__warm_start': [True, False],
            'lr__random_state': [42],
            'lr__solver': ['liblinear']
        }
    elif model_num == 'l2':
        pipe = Pipeline([
            ('lr',LogisticRegression())
        ])

        pipe_params = {
            'lr__penalty': ['l2'], #ridge
            'lr__C': [1, 1.5, 2, 2.5],
            'lr__class_weight': ['balanced'],
            'lr__warm_start': [True, False],
            'lr__random_state': [42],
            'lr__solver': ['lbfgs', 'liblinear']
        }
    else:
        print('Error due to invalid model number passed. Please check')
        return
    
    gsv = GridSearchCV(pipe,param_grid=pipe_params,cv=5)
    gsv.fit(reddit_sparse_train_df,y_train)
    print('training score {0}: '.format(gsv.score(reddit_sparse_train_df,y_train)))
    print('test score {0}: '.format(gsv.score(reddit_sparse_test_df,y_test)))
    
    predictions = gsv.predict(reddit_sparse_test_df.values)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {gsv.best_params_}')

#### Logistic regression with Lasso

In [404]:
l_Regression('l1')

training score 0.928345626975764: 
test score 0.8672286617492097: 
--------
[[417  43]
 [ 83 406]]
Best params = {'lr__C': 1, 'lr__class_weight': 'balanced', 'lr__penalty': 'l1', 'lr__random_state': 42, 'lr__solver': 'liblinear', 'lr__warm_start': True}


#### Logistic regression with ridge

In [405]:
l_Regression('l2')

training score 0.9392342817000351: 
test score 0.8703898840885143: 
--------
[[419  41]
 [ 82 407]]
Best params = {'lr__C': 1, 'lr__class_weight': 'balanced', 'lr__penalty': 'l2', 'lr__random_state': 42, 'lr__solver': 'lbfgs', 'lr__warm_start': True}
