# Seasoned Onion Data Cleaning/Vectorization


In [1]:
import spacy
import en_core_web_md
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

nlp = en_core_web_md.load(parse=True, tag=True, entity=True)
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, #entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

We now load in all of the web scraped articles in 3 columns, the headline, body text and the corresponding tags

In [2]:
import pandas as pd
import numpy as np

data_df = pd.read_csv("allContentMeta.csv", encoding = "ISO-8859-1")

In [3]:
data_df.rename(columns = {'ï»¿hi':'headline','Body Text':'body', 'Meta Tags':'tags'}, inplace = True) #Weird glitch with excel had to rename column
data_df

Unnamed: 0,headline,body,tags
0,Catcaller Not Sure What The Big Deal Is Since ...,CHICAGO'Urging everyone to lighten up and 'sav...,"vol 56 issue 29, local"
1,"Huh, That's Kind Of Weird,' Thinks Fruit Fly D...","OVERLAND PARK, KS' Observing yet disregarding ...","vol 56 issue 29, animals, local"
2,Man Unaware Majority Of His Life's Failures Di...,"LANSING, MI'Confirming the man's utter oblivio...","vol 56 issue 29, diet, food, health, local"
3,Toddler Feels Somewhat Torn About Pretending T...,"SOUTH BEND, IN'Expressing misgivings over perp...","vol 56 issue 29, kids, police, police brutalit..."
4,Dog Could Really Go For Some Women's Underwear...,"SPRINGFIELD, MO'Unable to control his intense ...","vol 56 issue 28, animals, dogs, local"
...,...,...,...
10077,Friction Must Be Eliminated!,"Toward a Purer Physics If there is a Satan, fe...","opinion, archive, commentary, vol 24 issue 16,..."
10078,Highbrow Student Apartments Offer Authentic Ea...,"Wealthy Tenants Gladly Pay Extra For Filth, Cr...","local, archive, news, vol 24 issue 16, local"
10079,The Onion's Weekly Campus Crime Roundup,Compiled from the logs of University Police & ...,"crime, archive, archive, vol 24 issue 16, the ..."
10080,Loveliest Buses Compete For The Crown In City-...,"ARCOLA, OHIO' The sweet, intoxicating aroma of...","local, archive, news, vol 24 issue 16, local"


We should now remove any Nan values in the data frame. If a Nan values exists in one column for a sample, remove the whole sample

In [4]:
data_df.dropna(subset = ["body"], inplace=True) #Removes Nan values for the Body text column
data_df

Unnamed: 0,headline,body,tags
0,Catcaller Not Sure What The Big Deal Is Since ...,CHICAGO'Urging everyone to lighten up and 'sav...,"vol 56 issue 29, local"
1,"Huh, That's Kind Of Weird,' Thinks Fruit Fly D...","OVERLAND PARK, KS' Observing yet disregarding ...","vol 56 issue 29, animals, local"
2,Man Unaware Majority Of His Life's Failures Di...,"LANSING, MI'Confirming the man's utter oblivio...","vol 56 issue 29, diet, food, health, local"
3,Toddler Feels Somewhat Torn About Pretending T...,"SOUTH BEND, IN'Expressing misgivings over perp...","vol 56 issue 29, kids, police, police brutalit..."
4,Dog Could Really Go For Some Women's Underwear...,"SPRINGFIELD, MO'Unable to control his intense ...","vol 56 issue 28, animals, dogs, local"
...,...,...,...
10077,Friction Must Be Eliminated!,"Toward a Purer Physics If there is a Satan, fe...","opinion, archive, commentary, vol 24 issue 16,..."
10078,Highbrow Student Apartments Offer Authentic Ea...,"Wealthy Tenants Gladly Pay Extra For Filth, Cr...","local, archive, news, vol 24 issue 16, local"
10079,The Onion's Weekly Campus Crime Roundup,Compiled from the logs of University Police & ...,"crime, archive, archive, vol 24 issue 16, the ..."
10080,Loveliest Buses Compete For The Crown In City-...,"ARCOLA, OHIO' The sweet, intoxicating aroma of...","local, archive, news, vol 24 issue 16, local"


In [5]:
data_df.dropna(subset = ["headline"], inplace=True) #Removes Nan values for the Headline column
data_df

Unnamed: 0,headline,body,tags
0,Catcaller Not Sure What The Big Deal Is Since ...,CHICAGO'Urging everyone to lighten up and 'sav...,"vol 56 issue 29, local"
1,"Huh, That's Kind Of Weird,' Thinks Fruit Fly D...","OVERLAND PARK, KS' Observing yet disregarding ...","vol 56 issue 29, animals, local"
2,Man Unaware Majority Of His Life's Failures Di...,"LANSING, MI'Confirming the man's utter oblivio...","vol 56 issue 29, diet, food, health, local"
3,Toddler Feels Somewhat Torn About Pretending T...,"SOUTH BEND, IN'Expressing misgivings over perp...","vol 56 issue 29, kids, police, police brutalit..."
4,Dog Could Really Go For Some Women's Underwear...,"SPRINGFIELD, MO'Unable to control his intense ...","vol 56 issue 28, animals, dogs, local"
...,...,...,...
10077,Friction Must Be Eliminated!,"Toward a Purer Physics If there is a Satan, fe...","opinion, archive, commentary, vol 24 issue 16,..."
10078,Highbrow Student Apartments Offer Authentic Ea...,"Wealthy Tenants Gladly Pay Extra For Filth, Cr...","local, archive, news, vol 24 issue 16, local"
10079,The Onion's Weekly Campus Crime Roundup,Compiled from the logs of University Police & ...,"crime, archive, archive, vol 24 issue 16, the ..."
10080,Loveliest Buses Compete For The Crown In City-...,"ARCOLA, OHIO' The sweet, intoxicating aroma of...","local, archive, news, vol 24 issue 16, local"


In [90]:
data_df.to_csv("dataToLabel.csv") #Not important

We went from 10082 samples down to 6967, quite the drop but it should not be detrimental.

# Data Cleaning

We will be training our AI based on body text. That way the user can input there own shortened body and receive tags based on the text they wrote. We won't need to do much cleaning on the body, but we will need the headlines for later when we actually get some classification. 

Here is a basic data cleaning function/pipeline, I am using re (regular expressions) and the string library. I commented beside each line so it should be self explanatory.

In [6]:
import re
import string

def clean_text_round1(text):
    'Lets lowercase the words, remove puncation and remove any numbers as well'
    text = re.sub(r"http\S+", "", text) #Remove HTTPS links
    text = text.lower() #make all words lowercase
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text) #remove numbers
    text = re.sub('[‘’“”…]', '', text) #remove more puncuation
    text = re.sub('\n', '', text) #remove \n from the text
    text = re.sub("[^0-9a-zA-Z]+", " ", text); #remove special characters
    return text

round1 = lambda x: clean_text_round1(x)

In [21]:
headline_clean = pd.DataFrame(data_df.headline.apply(round1)) #Apply the cleaning onto the headline column

In [22]:
headline_clean

Unnamed: 0,headline
0,catcaller not sure what the big deal is since ...
1,huh thats kind of weird thinks fruit fly divin...
2,man unaware majority of his lifes failures dir...
3,toddler feels somewhat torn about pretending t...
4,dog could really go for some womens underwear ...
...,...
10077,friction must be eliminated
10078,highbrow student apartments offer authentic ea...
10079,the onions weekly campus crime roundup
10080,loveliest buses compete for the crown in cityw...


As you can see above, the headlines are lowercased and cleaned up of many other things; but there is even more we can do to clean the data.

A contraction map is a map of common contractions used in the english language. We use this block of code to change all of the contractions into there not contracted version. I borrowed this function from the following link https://www.kdnuggets.com/2018/08/practitioners-guide-processing-understanding-text-2.html

In [10]:
from contractions import CONTRACTION_MAP
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

contr = lambda x: expand_contractions(x)

The next block is a stemmer. The point of a stemmer is to reduce the overall volume of words in our text, this will be helpful for our classifier later. It will change the words "running", "ran" and "runs" into just run. We will be using a porter stemmer, created by Dr. Martin Porter. You might notice later our output of words may not be semantically correct and not part of the dictionary, but this won't be a problem for us.

In [11]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

stemmer = lambda x: simple_stemmer(x)

A lemmatizer will change our root words into there correct dictionary version.

In [12]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

lem = lambda x: lemmatize_text(x)

We will now clean both the headline data set and body text data set. It is not in a proper pipeline at the moment but this will be fixed later.

In [23]:
headline_clean2 = pd.DataFrame(headline_clean.headline.apply(contr))
headline_clean3 = pd.DataFrame(headline_clean2.headline.apply(stemmer))
headline_clean4 = pd.DataFrame(headline_clean3.headline.apply(lem))

In [14]:
body_clean = pd.DataFrame(data_df.body.apply(round1))
body_clean2 = pd.DataFrame(body_clean.body.apply(contr))
body_clean3 = pd.DataFrame(body_clean2.body.apply(stemmer))
body_clean4 = pd.DataFrame(body_clean3.body.apply(lem))

In [15]:
body_clean4

Unnamed: 0,body
0,chicagourg everyon to lighten up and save the ...
1,overland park ks observ yet disregard the bodi...
2,lans miconfirm the man utter oblivi to the roo...
3,south bend inexpress misgiv over perpetu a sys...
4,springfield mounabl to control hi intens crave...
...,...
10077,toward a pure physic if there be a satan fello...
10078,wealthi tenant gladli pay extra for filth crim...
10079,compil from the log of univers polic secur sun...
10080,arcola ohio the sweet intox aroma of fuel exha...


In [17]:
body_clean4.to_csv("body_clean4.csv")

For cleaning the tags, we won't want to remove as much as we did for the headlines and body text, so we will make a new text cleaning function:

In [18]:
def clean_text_tags(text):
    'Lets lowercase the words, remove puncation and remove any numbers as well'
    text = re.sub('vol', '', text) #remove vol from the text
    text = re.sub('\w*\d\w*', '', text) #remove numbers
    text = re.sub('[‘’“”…]', '', text) #remove more puncuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('issue', '', text) #remove issue from the text
    text = re.sub('archive', '', text) #remove \n from the text
    text = re.sub('entertainment', '', text) #remove \n from the text
    text = re.sub('local', '', text) #remove \n from the text
    text = re.sub('news', '', text) #remove \n from the text
    text = re.sub('in', '', text) #remove \n from the text
    text = re.sub('brief', '', text) #remove \n from the text
    text = re.sub('the', '', text) #remove \n from the text
    text = re.sub('onion', '', text) #remove \n from the text
    return text

round2 = lambda x: clean_text_tags(x)

Words like "news" "brief" and "entertainment" for example were overused in our tags and was saturating our classifier way too much, we will remove those words in our cleaner.

In [19]:
tags_clean = pd.DataFrame(data_df.tags.apply(round2))
tags_clean

Unnamed: 0,tags
0,
1,animals
2,diet food health
3,kids police police brutality
4,animals dogs
...,...
10077,opion commentary omar shenk
10078,
10079,crime
10080,


Now we will bring together all of our clean data frames and remove any more empty values

In [80]:
allData = pd.concat([body_clean4, headline_clean4, tags_clean], axis=1)
allData

Unnamed: 0,body,headline,tags
0,chicagourg everyon to lighten up and save the ...,catcal not sure what the big deal be sinc he d...,
1,overland park ks observ yet disregard the bodi...,huh that kind of weird think fruit fli dive in...,animals
2,lans miconfirm the man utter oblivi to the roo...,man unawar major of hi life failur directli ca...,diet food health
3,south bend inexpress misgiv over perpetu a sys...,toddler feel somewhat torn about pretend to be...,kids police police brutality
4,springfield mounabl to control hi intens crave...,dog could realli go for some woman underwear r...,animals dogs
...,...,...,...
10077,toward a pure physic if there be a satan fello...,friction must be elimin,opion commentary omar shenk
10078,wealthi tenant gladli pay extra for filth crim...,highbrow student apart offer authent eastcoast...,
10079,compil from the log of univers polic secur sun...,the onion weekli campu crime roundup,crime
10080,arcola ohio the sweet intox aroma of fuel exha...,lovely buse compet for the crown in citywid bu...,


In [123]:
allData.to_csv('allData.csv') #Not really important

This line here replaces all empty lines with NaN values because we can't remove empty strings from data frames, we have to convert them. The conversion is quite weird because each empty string is a different size so I had to execute the block increasing the empty space by one every time.

In [100]:
allData['tags'].replace('                   ', np.nan, inplace=True) #The empty ' ' is that long because thats how long the empty strings are for some reason
allData 

Unnamed: 0,body,headline,tags
0,chicagourg everyon to lighten up and save the ...,catcal not sure what the big deal be sinc he d...,
1,overland park ks observ yet disregard the bodi...,huh that kind of weird think fruit fli dive in...,animals
2,lans miconfirm the man utter oblivi to the roo...,man unawar major of hi life failur directli ca...,diet food health
3,south bend inexpress misgiv over perpetu a sys...,toddler feel somewhat torn about pretend to be...,kids police police brutality
4,springfield mounabl to control hi intens crave...,dog could realli go for some woman underwear r...,animals dogs
...,...,...,...
10077,toward a pure physic if there be a satan fello...,friction must be elimin,opion commentary omar shenk
10078,wealthi tenant gladli pay extra for filth crim...,highbrow student apart offer authent eastcoast...,
10079,compil from the log of univers polic secur sun...,the onion weekli campu crime roundup,crime
10080,arcola ohio the sweet intox aroma of fuel exha...,lovely buse compet for the crown in citywid bu...,


In [101]:
allData.dropna(subset = ["tags"], inplace=True) #Removes Nan values for the Body text column
allData

Unnamed: 0,body,headline,tags
1,overland park ks observ yet disregard the bodi...,huh that kind of weird think fruit fli dive in...,animals
2,lans miconfirm the man utter oblivi to the roo...,man unawar major of hi life failur directli ca...,diet food health
3,south bend inexpress misgiv over perpetu a sys...,toddler feel somewhat torn about pretend to be...,kids police police brutality
4,springfield mounabl to control hi intens crave...,dog could realli go for some woman underwear r...,animals dogs
5,charlott ncnote how sad it make her famili to ...,father increasingli virul racism excus becaus ...,racism
...,...,...,...
10075,there be ulterior motiv hide behind mani of th...,saturday morn live be a come masterpiec,opion commentary
10076,it will have page say sourc professor theodor ...,professor to publish book,education
10077,toward a pure physic if there be a satan fello...,friction must be elimin,opion commentary omar shenk
10079,compil from the log of univers polic secur sun...,the onion weekli campu crime roundup,crime


This block of code removes all articles past the 3888th one because the articles past that start becoming very wierd in there headlines and body text, we lose quite alot but going through individually would take a lot of time.

In [103]:
cut_allData = allData.iloc[:3888, :]
cut_allData

Unnamed: 0,body,headline,tags
1,overland park ks observ yet disregard the bodi...,huh that kind of weird think fruit fli dive in...,animals
2,lans miconfirm the man utter oblivi to the roo...,man unawar major of hi life failur directli ca...,diet food health
3,south bend inexpress misgiv over perpetu a sys...,toddler feel somewhat torn about pretend to be...,kids police police brutality
4,springfield mounabl to control hi intens crave...,dog could realli go for some woman underwear r...,animals dogs
5,charlott ncnote how sad it make her famili to ...,father increasingli virul racism excus becaus ...,racism
...,...,...,...
6818,lo angelessay that it time a visibl weari harr...,harrison ford beg agent to just let him die now,celebrities death
6819,lo angelescit flag rate and a desir to retool ...,nbc cancel piven after season,celebrities
6820,lo angelesexplain the exact natur of the situa...,justin timberlak tell jessica biel no one will...,just timberlake
6822,milford deafter sever month spend compil and a...,woman build ironclad case prove mila kuni look...,celebrities


Now we take the above data frame and split it into three seperate ones.

In [104]:
tags_clean = pd.DataFrame(cut_allData.tags)
headline_clean = pd.DataFrame(cut_allData.headline)
body_clean = pd.DataFrame(cut_allData.body)

## Test Headline (This is currently irrelevant but may be of use in the future)

Here I will be creating a new headline to test our classifier in the future!

In [143]:
new_headline = ["Come see what your horoscope says about your food taste!"]
new_headline_df = pd.DataFrame(new_headline, columns = ['headline']) 

In [144]:
new_headline_clean = pd.DataFrame(new_headline_df.headline.apply(round1))
new_headline2 = pd.DataFrame(new_headline_clean.headline.apply(contr))
new_headline3= pd.DataFrame(new_headline2.headline.apply(stemmer))
new_headline4 = pd.DataFrame(new_headline3.headline.apply(lem))
new_headline4

Unnamed: 0,headline
0,come see what your horoscop say about your foo...


In [146]:
new_headlines = headline_clean.append(new_headline4)
new_headlines.tail()

Unnamed: 0,headline
10076,professor to publish book
10077,friction must be elimin
10079,the onion weekli campu crime roundup
10081,your real horoscop
0,come see what your horoscop say about your foo...


# Binary Transformation (Bag of Words Model)

Now that we have 6967 headline and tag samples, we now have to put them in a format where our classifier can actually learn from them. To do this, we have two options. 

We can use One Hot Encoding where every sample will have a 1 or a 0 for every word, depending on if that sample has the word. So our classifier would remeber that if the user feeds me a headline or a tag and it includes the same words as another headline, I will know what to choose. 

Another method is TFxIDF. We give each word for a headline or tag an importance value. But since our headlines are only 10 - 15 words long, we wont exactly need word importance. The better option here may be One Hot Encoding

First, we have to turn our headline data frame into a list so it can be One Hot Encoded

In [147]:
headline_list = headline_clean['headline'].values.tolist()
headline_list

['huh that kind of weird think fruit fli dive in to dish of honey contain corp of other fruit fli',
 'man unawar major of hi life failur directli caus by get onli daili recommend thiamin',
 'toddler feel somewhat torn about pretend to be policeman in current climat',
 'dog could realli go for some woman underwear right now',
 'father increasingli virul racism excus becaus of how difficult it be for him to get out of chair now',
 'gentl whisper of wind through willow dappl of sunlight upon leav unsur what more they have to do to make dipshit look up from phone',
 'hey i think your mute man tell cowork scream fuck you fuck all of you on zoom call',
 'corrupt policeman worri about work with partner who ha never onc take bribe',
 'vacuou fool use wicker and rattan interchang',
 'man do whippet while set off in wood behind harde take moment to reflect on how promis of freedom yet unfulfil',
 'polic depart celebr fourth of juli by use firework for crowd control',
 'yeah we could invit friend

In [105]:
tags_list = tags_clean['tags'].values.tolist()
tags_list

['    animals ',
 '    diet food health ',
 '    kids police police brutality ',
 '    animals dogs ',
 '    racism ',
 '    technology cellphones nature ',
 '    technology workplace coworkers ',
 '    police ',
 '    crafts ',
 '    freedom drugs fast food ',
 '    police protest ',
 '    relationships food friends ',
 '    food vegetarian animals dg alcohol ',
 '    nature outdoors prison ',
 '    relationships teenagers ',
 '    violence lifestyle ',
 '    dogs animals pets ',
 '    family healthcare ',
 '    dg coronavirus ',
 '    playstation  ',
 '    coronavirus ',
 '    food ',
 '    coronavirus ',
 '    coronavirus family grandparents ',
 '    gender ',
 '    protest race george floyd ',
 '    coronavirus ',
 '    kids ',
 '    work ',
 '    coronavirus coworkers ',
 '    dg cookg ',
 '    teenagers ',
 '    coronavirus healthcare ',
 '    family ',
 '    teenagers school coronavirus ',
 '    coronavirus ',
 '    police violence coronavirus ',
 '    education technology coron

In [106]:
body_list = body_clean['body'].values.tolist()
body_list

['overland park ks observ yet disregard the bodi of of hi fellow fli suspend in the tempt but viscou substanc a fruit fli reportedli find itself think huh that kind of weird tuesday as it plung into a dish of honey contain the suffoc corps grant it a littl disturb but it probabl not anyth to worri about say the drosophila melanogast put the somewhat unusu sight of it decea brethren out of it mind as it draw nearer to the mesmer expans of delici glisten honey that certainli not someth you see everi day but so what i be alreadi a week old i can not pretend i know everyth but life too short to be scare of everi new experi sure that a lot of bodi but there also a lot of wonder honey hey look my mom and dad be down there at press time the sink fruit fli mouth have fill up with honey as it vainli attempt to cri out a warn to approach fruit fli',
 'lans miconfirm the man utter oblivi to the root of all hi problem eric shoup reportedli remain unawar that the major of hi life failur be the dire

In [149]:
new_headline_list = new_headlines['headline'].values.tolist()
new_headline_list[-1]

'come see what your horoscop say about your food tast'

In [108]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [142]:
#Tags Vectorizer

#Takes the stopwords, min_df ignores term that have a doc frequncy < 3,
#ngram_range(1,1) allows only mono-grams to be taken as a vector

cvec_tags = CountVectorizer(stop_words=None, min_df=100, ngram_range=(1,1)) #This is for the tags vectorization
sf_tags = cvec_tags.fit_transform(tags_list) #returns a transformed version of the tags


In [144]:
#Body Vectorizer
stopwords = nltk.corpus.stopwords.words('english') #default nltk stopwords
RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords)) #Also use RegEx stop words, wont use this though

cvec_body = CountVectorizer(stop_words=stopwords, min_df=100, max_df=0.5, ngram_range=(1,1)) 
sf_body = cvec_body.fit_transform(body_list) #returns a transformed version of the headlines


In [153]:
#Headline Vectorizer
stopwords = nltk.corpus.stopwords.words('english') #default nltk stopwords
RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords)) #Also use RegEx stop words, wont use this though

cvec_headlines = CountVectorizer(stop_words=stopwords, min_df=3, ngram_range=(1,1)) 
sf_headlines = cvec_headlines.fit_transform(headline_list) #returns a transformed version of the headlines


In [155]:
#New Headline Vectorizer (not relevant)
stopwords = nltk.corpus.stopwords.words('english') #default nltk stopwords
RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords)) #Also use RegEx stop words, wont use this though

cvec_new_headlines = CountVectorizer(stop_words=stopwords, min_df=3, ngram_range=(1,1)) 
sf_new_headlines = cvec_new_headlines.fit_transform(new_headline_list) #returns a transformed version of the headlines


In [154]:
#Headline One-Hot Encoder
from sklearn.preprocessing import Binarizer

onehot = Binarizer()
sf_headlines = onehot.fit_transform(sf_headlines)

data_oh = pd.DataFrame(sf_headlines.toarray(), columns=cvec_headlines.get_feature_names())
data_oh.to_csv('headline_onehot_final.csv')

In [143]:
#Tag One-Hot Encoder
from sklearn.preprocessing import Binarizer
onehot = Binarizer()
sf_tags = onehot.fit_transform(sf_tags)

data_oh = pd.DataFrame(sf_tags.toarray(), columns=cvec_tags.get_feature_names())
data_oh.to_csv('tags_onehot_final3.csv')

In [145]:
#Body One-Hot Encoder
onehot = Binarizer()
sf_body = onehot.fit_transform(sf_body)

data_oh = pd.DataFrame(sf_body.toarray(), columns=cvec_body.get_feature_names())
data_oh.to_csv('body_onehot_final2.csv')

In [156]:
#New Headline One-Hot Encoder (not relevant)
onehot = Binarizer()
sf_new_headlines = onehot.fit_transform(sf_new_headlines)

data_oh = pd.DataFrame(sf_new_headlines.toarray(), columns=cvec_new_headlines.get_feature_names())
data_oh.to_csv('new_headline_onehot_final.csv')

Now we can move on to the AI classification part