In [None]:
import pandas as pd
import nltk
import re
import numpy as np
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Json file option
filejson = "C:/Users/danie/Desktop/bbc_news_list_uk.json"
filecsv = "C:/Users/danie/Desktop/bbc_news_list_uk.csv"
article_df = pd.read_csv(filecsv)
article_df = article_df.assign(Article_Number=range(len(article_df)))
article_df = article_df.reset_index()
article_df.info()

In [None]:
#this code assumes the first four digits are the year. can be changed for last of middle
year = article_df['news_post_date'].str[:4]
article_df['year']=year

In [None]:
#filename = 'articles.pkl'

#article_df = pd.read_pickle(filename)
#article_df = article_df.assign(Article_Number=range(len(article_df)))
#article_df = article_df.reset_index()
#article_df.info()

In [None]:
# tokenize sentences in an article 
import re

def split_sentences(article, article_id, year):
    pattern = r'(?<=[a-z0-9"]) *[.?!] *(?=[A-Z])'
    article = re.sub(pattern, r'\g<0> ', article)
    sentences = nltk.sent_tokenize(article)
    sentences_with_id = [(sentence, article_id, year) for sentence in sentences]
    return sentences_with_id

sentences_list = []

# add sentences to a new DF along with article ID 
for article, article_id, year in article_df[['content','Article_Number', 'year']].values:
    sentences = split_sentences(str(article), article_id, year)
    sentences_list.extend(sentences)

sentences_df = pd.DataFrame(sentences_list, columns= ['sentences', 'article_id', 'year'])

In [None]:
def pronoun_occurances(text):
    """ This function will count the number of female and male pronoun occurences in a given sentence. 
    We will need to update the REGEX in order to incorporate more pronouns if we need to"""
    pattern_m = r'(\s|^)(he|his|him|he\'s|hes)\b' #this regex will capture he/his as standalone words within a string but also at beginning of sentence
    matches_m = re.findall(pattern_m, text, re.IGNORECASE) #IGNORECASE is necessary to make sure that it picks up the pronouns at the beginning of a sentence
    pattern_f = r'(\s|^)(she|her|hers|shes|she\'s)\b'
    matches_f = re.findall(pattern_f, text, re.IGNORECASE)
    count_m = len(matches_m)
    count_f = len(matches_f)
    return count_f, count_m

In [None]:
#create a variable applying the function of pronoun occurence
sent = sentences_df['sentences'].apply(pronoun_occurances)
# Create two new columns in sentences DF from the tuple output in "sent"
sentences_df['female_count'] = [x[0] for x in sent]
sentences_df['male_count']= [x[1] for x in sent]

#Bug is fixed and now it counts properly
sentences_df

In [None]:
def compare_count(male_col, female_col): 
    """This function compares the count of female to male pronouns. It will output "1" if male count bigger
    than female count, "neutral" if the count is equal, and "female" if there is a higher female count. 
    The function returns strings because we need categorical variables for log reg to run"""
    if male_col > female_col: 
        return "2"
    elif male_col == female_col: 
        return "1"
    else:
        return "0"

sentences_df['col_type'] = sentences_df.apply(lambda row: compare_count(row['male_count'], row['female_count']), axis=1)
sentences_df



**Building the Classifier**

**Logistics Regression Classifier**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from nltk.corpus import stopwords

In [None]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
#Sentence Encoding

sentences_df['encoded_sentences'] = sentences_df.loc[:,'sentences']

def tidy_text(sentence, remove_stopwords = True):

    sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
    sentence = re.sub(r'\<a href', ' ', sentence)
    sentence = re.sub(r'&amp;', '', sentence) 
    sentence = re.sub("\d+", "", sentence)
        #changed the number detection code
    sentence = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', sentence)
    sentence = re.sub(r'<br />', ' ', sentence)
    sentence = re.sub(r'\'', ' ', sentence)

        # Tokenize each word
    sentence =  nltk.WordPunctTokenizer().tokenize(sentence)

    nltk.tag.pos_tag(sentence.split())
    tagged_sentence = nltk.tag.pos_tag(sentence.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    sentence = (' '.join(edited_sentence))

    # Convert words to lower case
    sentence = sentence.lower()
    
    # Expand contractions
    if True:
        sentence = sentence.split()
        new_text = []
        for word in sentence:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        sentence = " ".join(new_text)
    
    
    # remove stopwords
    if remove_stopwords:
        sentence = sentence.split()
        stops = set(stopwords.words("english"))
        sentence = [w for w in sentence if not w in stops]
        sentence = " ".join(sentence)


    
    # Lemmatize each token
    lemma = nltk.stem.WordNetLemmatizer()
    sentence = [lemma.lemmatize(word) for word in sentence]
    return sentence

    #Maybe we should remove names? At least (could just be proper nouns)

sentences_df['encoded_sentences'] = sentences_df['encoded_sentences'].apply(tidy_text)

In [60]:
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
#Sentence Encoding

def tidy_text(sentence, remove_stopwords = True):

    sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence)
    sentence = re.sub(r'\<a href', ' ', sentence)
    sentence = re.sub(r'&amp;', '', sentence) 
    sentence = re.sub("\d+", "", sentence)
        #changed the number detection code
    sentence = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', sentence)
    sentence = re.sub(r'<br />', ' ', sentence)
    sentence = re.sub(r'\'', ' ', sentence)

        # Tokenize each word
    sentence =  nltk.WordPunctTokenizer().tokenize(sentence)

    nltk.tag.pos_tag(sentence)
    tagged_sentence = nltk.tag.pos_tag(sentence)
    lemma = nltk.stem.WordNetLemmatizer()

    lemma_tags = ('NN', 'VB', 'JJ', 'RB')
    pn_tags = ('NNP', 'NNPS')

    def POS_tag_lookup(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None

    new_words = [] 
    for word, tag in tagged_sentence:
        print("enter loop",word, tag)
        if tag not in pn_tags:
            if tag in lemma_tags:
                new_tag = POS_tag_lookup(tag)
                print("new tag", new_tag)
                new_word = (lemma.lemmatize(word, new_tag))
                print("lematized word with POS", new_word)
                new_words.append(new_word)
                print("lemma", new_words)
            else:
                new_words.append(word)

    sentence = new_words


    #sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    #print(sentence)


    #sentence = [lemma.lemmatize(word) for word in sentence]
    #return sentence

    # Expand contractions
    if True:
        new_text = []
        for word in sentence:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
    
    
    # remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        sentence = [w for w in sentence if not w in stops]
    
    return sentence



text = "the bloodied person saw some blood drawing draws drew draw running runs run change changing changes change Professor John in Denmark with Leo"
tidy_text(text)


enter loop the DT
enter loop bloodied JJ
new tag a
lematized word with POS bloodied
lemma ['the', 'bloodied']
enter loop person NN
new tag n
lematized word with POS person
lemma ['the', 'bloodied', 'person']
enter loop saw VBD
enter loop some DT
enter loop blood NN
new tag n
lematized word with POS blood
lemma ['the', 'bloodied', 'person', 'saw', 'some', 'blood']
enter loop drawing VBG
enter loop draws JJ
new tag a
lematized word with POS draws
lemma ['the', 'bloodied', 'person', 'saw', 'some', 'blood', 'drawing', 'draws']
enter loop drew JJ
new tag a
lematized word with POS drew
lemma ['the', 'bloodied', 'person', 'saw', 'some', 'blood', 'drawing', 'draws', 'drew']
enter loop draw NN
new tag n
lematized word with POS draw
lemma ['the', 'bloodied', 'person', 'saw', 'some', 'blood', 'drawing', 'draws', 'drew', 'draw']
enter loop running VBG
enter loop runs NNS
enter loop run VBP
enter loop change NN
new tag n
lematized word with POS change
lemma ['the', 'bloodied', 'person', 'saw', 'som

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['bloodied',
 'person',
 'saw',
 'blood',
 'drawing',
 'draws',
 'drew',
 'draw',
 'running',
 'runs',
 'run',
 'change',
 'changing',
 'changes',
 'change']

In [None]:
#lemma test

text = "the bloodied person saw some blood drawing draws drew draw running runs run change changing changes change"
nltk.tag.pos_tag(text.split())
tagged_sentence = nltk.tag.pos_tag(text.split())

tag_dict = {"J": wordnet.ADJ, #adjective
    "N": wordnet.NOUN,#noun
    "V": wordnet.VERB,#verb
    "R": wordnet.ADV} #adverb

return tag_dict.get(tag, wordnet.NOUN)


def lemmatize_text(text):
  text=[WordNetLemmatizer().lemmatize(w, get_pos_tags(w)) for w in text]   
  return text

final_output=lemmatize_text(example)
print (final_output)
    
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    sentence = (' '.join(edited_sentence))

text =  nltk.WordPunctTokenizer().tokenize(text)
lemma = nltk.stem.WordNetLemmatizer()
sentence = [lemma.lemmatize(word) for word in text]
sentence

In [None]:
pd.get_dummies(sentences_df.year)
rated_dummies = pd.get_dummies((sentences_df).year)
sentences_df = pd.concat([sentences_df, rated_dummies], axis=1)

In [None]:
#X = sentences_df[['encoded_sentences', '2010', '2012']]
#y = sentences_df["col_type"]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X = sentences_df['encoded_sentences']
y = sentences_df['col_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#vectorizer = TfidfVectorizer(max_features= 1000, lowercase=False, tokenizer=False)
def fake(token):
    return token

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  


X_train = tfidf.fit_transform(X_train)
X_test = tfidf.fit_transform(X_test)

In [None]:
#multinomial logistic regression 
logreg = LogisticRegression(multi_class = "multinomial", solver = "lbfgs", max_iter= 5000) #classifier 
logreg.fit(X_train, y_train) #fit the model 
logreg.score(X_train, y_train) #get accuracy

In [None]:
coefs = logreg.coef_[0]
#male is 2 and female is 0

sorted_coef = sorted((zip(tfidf.get_feature_names(), coefs)), key = lambda x: x[1], reverse=True)

high_coef = sorted_coef[:10]
low_coef = sorted_coef[-10:]

print("highest coefs")
for i in high_coef: 
    print(i)

In [None]:
print("lowest coefs")
for i in low_coef: 
    print(i)

In [None]:
y

In [None]:
pd.DataFrame(zip(X_train.columns, np.transpose(abs(logreg.coef_))), columns=['features', 'coef']) #use absolute values to identify biggest coeffs

Interestingly, male coefficient for neutral is higher than the female. Otherwise, as expected, the coefficients for female and male each are correspondingly high for each gender. 

In [None]:
prediction = logreg.predict(X_test) #predict test 
print(metrics.accuracy_score(y_test, prediction)) #accuracy 
print(metrics.confusion_matrix(y_test, prediction)) #confusion matrix
print(metrics.classification_report(y_test, prediction))

In [None]:
#I think with the above solution we wouldn't need this function 
def count_words(text, word_list):
    return sum(text.count(word) for word in word_list)

sentences_df['male_count2'] = sentences_df['sentences'].apply(count_words, word_list=his_w)
sentences_df['female_count2'] = sentences_df['sentences'].apply(count_words, word_list=her_w)


In [None]:
pd.set_option('display.max_colwidth', 1000)
sentences_df.loc[[7]]

#there is a bug here. It weirdly seems to be double counting? The zip function is new to me though so maybe thats the problem. 

In [None]:
df = pd.DataFrame({'text': ['This is a sample text', 'Another text example', 'One more example']})

# define two lists of specific words to count
word_list1 = ['text', 'example']
word_list2 = ['is', 'more']

def count_words(text, word_list):
    return sum(text.count(word) for word in word_list)

# use apply() to add two new columns with the counts of the specific words in each list
df['word_count1'] = df['text'].apply(count_words, word_list=word_list1)
df['word_count2'] = df['text'].apply(count_words, word_list=word_list2)

# print the resulting DataFrame
print(df)

