In [51]:
# import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, FunctionTransformer
from sklearn.decomposition import TruncatedSVD

from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import textstat

import re
from itertools import combinations

import cloudpickle
import pickle


def merge_two_dicts(d1, d2):
            dm = d1.copy()
            dm.update(d2)
            return dm
        
# two vectorizers to loop through
# Count Vectorizer with Lemmatizer
lemmatizer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()

def lemmatized_words(words):
    return (lemmatizer.lemmatize(w) for w in analyzer(words))

model = pickle.load(open('finalized_model.pkl', 'rb'))

test_str = "I had 2 cups of water just now"
sentence = [test_str]
sentence_df = pd.DataFrame(sentence, columns = ["title"])

def sentence_length(row):
    # Using regex to find all words, including those with quotes such as what's
    split_str = re.findall(r"[\w+|\w+\'\w+]+", row['title'])
    row['title_length'] = len(split_str)
    return row

def word_length(row):
    # Using regex to find all words, including those with quotes such as what's
    split_str = re.findall(r"[\w+|\w+\'\w+]+", row['title'])
    num_words = row['title_length']
    total_len = sum([len(w) for w in split_str])
    row['average_length'] = total_len/num_words
    return row

def word_tagger(row):
    tagged_list = pos_tag(word_tokenize(row['title']))
    # get all tags
    tags_tuples = [w[1] for w in tagged_list]
    # get unique tags and count
    tags, counts = np.unique(tags_tuples, return_counts=True)
    for i, tag in enumerate(tags):
        row[tag] = counts[i]
    return row

# check if columns in the list of word tag names, if not add column and give 0 as value
def empty_col(row):
    word_tag_cols = ['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW',
    'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT',
    'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB',
    'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', "``"]
    for col in word_tag_cols:
        if col not in row.index:
            row[col] = 0
    return row
    

sentence_df = sentence_df.apply(sentence_length, axis = 1)
sentence_df = sentence_df.apply(word_length, axis = 1)
sentence_df = sentence_df.apply(word_tagger, axis = 1)
sentence_df = sentence_df.apply(empty_col, axis = 1)

predicted_subreddit_num = model.predict(sentence_df)
#predicted_subreddit = 'Profound' if predicted_subreddit_num == 1 else "Not profound"
#predicted_subreddit
predicted_subreddit_num

array([0], dtype=int64)

In [43]:
df_test = pd.read_csv("../data/boring_tweets.csv")
df_test = df_test.reset_index(drop = True)
df_test = df_test.iloc[0:2,:]
df_test = pd.DataFrame(df_test.values, columns = ["title"])
df_test

Unnamed: 0,title
0,Retweet if you're sitting down.
1,I've been asleep but now I'm awake.


In [44]:
df_test = df_test.apply(sentence_length, axis = 1)
df_test = df_test.apply(word_length, axis = 1)
df_test = df_test.apply(word_tagger, axis = 1)
df_test = df_test.apply(empty_col, axis = 1)
df_test = df_test.fillna(0)
df_test

Unnamed: 0,.,CC,IN,JJ,NN,PRP,RB,RP,VBG,VBN,...,TO,UH,VB,VBD,VBZ,WDT,WP,WP$,WRB,``
0,1,0.0,1.0,0.0,1.0,1,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,1.0,0.0,2.0,0.0,2,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df_test.columns

Index(['.', 'CC', 'IN', 'JJ', 'NN', 'PRP', 'RB', 'RP', 'VBG', 'VBN', 'VBP',
       'average_length', 'title', 'title_length', '#', '$', '''', '(', ')',
       ',', ':', 'CD', 'DT', 'EX', 'FW', 'JJR', 'JJS', 'MD', 'NNP', 'NNPS',
       'NNS', 'PDT', 'POS', 'PRP$', 'RBR', 'RBS', 'SYM', 'TO', 'UH', 'VB',
       'VBD', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``'],
      dtype='object')

In [46]:
predicted_subreddit_num = model.predict(df_test)
predicted_subreddit_num

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.