In [75]:
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yolandaferreirofranchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [76]:
filename = 'articles.pkl'

article_df = pd.read_pickle(filename)
article_df = article_df.assign(Article_Number=range(len(article_df)))
article_df = article_df.reset_index()
article_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           70 non-null     int64 
 1   title           70 non-null     object
 2   link            70 non-null     object
 3   keywords        40 non-null     object
 4   creator         60 non-null     object
 5   video_url       0 non-null      object
 6   description     66 non-null     object
 7   content         65 non-null     object
 8   pubDate         70 non-null     object
 9   image_url       35 non-null     object
 10  source_id       70 non-null     object
 11  category        70 non-null     object
 12  country         70 non-null     object
 13  language        70 non-null     object
 14  Article_Number  70 non-null     int64 
dtypes: int64(2), object(13)
memory usage: 8.3+ KB


In [77]:
# tokenize sentences in an article 
def split_sentences(article, article_id):
    sentences = nltk.sent_tokenize(article)
    sentences_with_id = [(sentence, article_id) for sentence in sentences]
    return sentences_with_id

sentences_list = []

# add sentences to a new DF along with article ID 
for article, article_id in article_df[['content','Article_Number']].values:
    sentences = split_sentences(str(article), article_id)
    sentences_list.extend(sentences)

sentences_df = pd.DataFrame(sentences_list, columns= ['sentences', 'article_id'])

# TO-DO: add the data & find the pronouns to determine gender

In [78]:
sentences_df

Unnamed: 0,sentences,article_id
0,Do you want to know who the father of Ari Flet...,0
1,"Yes, we’re talking about the social media star...",0
2,"On Valentine’s Day, he told her that he wants ...",0
3,But the question is who Ari’s child’s father is.,0
4,"Before we talk about that, let’s quickly talk ...",0
...,...,...
4002,"He has covered stories such as Watergate, the ...",69
4003,He is also the author of five bestselling nove...,69
4004,"He has three children, lives in rural Pennsylv...",69
4005,You can read his daily columns at luciantrusco...,69


In [None]:
#sentences_df.drop(['female_count', 'male_count'], axis = 1)

In [None]:
his_w = [' his ', ' he ']
his_w_expand = ["his", "he", "man", "uncle", "dad", "daddy", "father", "boy", "husband"]
her_w = [" her ", " she "]
her_w_expand = ["her", "she", "woman", "aunt", "mother","mom", "mommy", "girl", "wife"]


def pronoun_occurance(text, female_list, male_list):
    female_count = sum(text.count(word) for word in female_list)
    male_count = sum(text.count(word) for word in male_list)
    return female_count, male_count

#sentences_df['female_count'], sentences_df['male_count'] = zip(sentences_df['sentences'].apply(pronoun_occurance, female_list=her_w, male_list=his_w))


In [79]:
#this solution fixes the bug of double counting 
def pronoun_occurances(text):
    """ This function will count the number of female and male pronoun occurences in a given sentence. 
    We will need to update the REGEX in order to incorporate more pronouns if we need to"""
    pattern_m = r'(\s|^)(he|his)\b' #this regex will capture he/his as standalone words within a string but also at beginning of sentence
    matches_m = re.findall(pattern_m, text, re.IGNORECASE) #IGNORECASE is necessary to make sure that it picks up the pronouns at the beginning of a sentence
    pattern_f = r'(\s|^)(she|her)\b'
    matches_f = re.findall(pattern_f, text, re.IGNORECASE)
    count_m = len(matches_m)
    count_f = len(matches_f)
    return count_f, count_m

In [80]:
#create a variable applying the function of pronoun occurence
sent = sentences_df['sentences'].apply(pronoun_occurances)
# Create two new columns in sentences DF from the tuple output in "sent"
sentences_df['female_count'] = [x[0] for x in sent]
sentences_df['male_count']= [x[1] for x in sent]

#Bug is fixed and now it counts properly
sentences_df

Unnamed: 0,sentences,article_id,female_count,male_count
0,Do you want to know who the father of Ari Flet...,0,0,0
1,"Yes, we’re talking about the social media star...",0,0,0
2,"On Valentine’s Day, he told her that he wants ...",0,3,2
3,But the question is who Ari’s child’s father is.,0,0,0
4,"Before we talk about that, let’s quickly talk ...",0,0,0
...,...,...,...,...
4002,"He has covered stories such as Watergate, the ...",69,0,1
4003,He is also the author of five bestselling nove...,69,0,1
4004,"He has three children, lives in rural Pennsylv...",69,0,2
4005,You can read his daily columns at luciantrusco...,69,0,1


In [81]:
sentences_df.loc[[7]] #works

Unnamed: 0,sentences,article_id,female_count,male_count
7,Did you know that she also runs her own business?,0,2,0


In [82]:
def compare_count(male_col, female_col): 
    """This function compares the count of female to male pronouns. It will output "1" if male count bigger
    than female count, "neutral" if the count is equal, and "female" if there is a higher female count)"""
    if male_col > female_col: 
        return "1"
    elif male_col < female_col: 
        return "0"
    else: 
        return "0.5"

sentences_df['col_type'] = sentences_df.apply(lambda row: compare_count(row['male_count'], row['female_count']), axis=1)
sentences_df

Unnamed: 0,sentences,article_id,female_count,male_count,col_type
0,Do you want to know who the father of Ari Flet...,0,0,0,0.5
1,"Yes, we’re talking about the social media star...",0,0,0,0.5
2,"On Valentine’s Day, he told her that he wants ...",0,3,2,0
3,But the question is who Ari’s child’s father is.,0,0,0,0.5
4,"Before we talk about that, let’s quickly talk ...",0,0,0,0.5
...,...,...,...,...,...
4002,"He has covered stories such as Watergate, the ...",69,0,1,1
4003,He is also the author of five bestselling nove...,69,0,1,1
4004,"He has three children, lives in rural Pennsylv...",69,0,2,1
4005,You can read his daily columns at luciantrusco...,69,0,1,1


**Building the Classifier**

In [83]:
sentences_df.head(25)

Unnamed: 0,sentences,article_id,female_count,male_count,col_type
0,Do you want to know who the father of Ari Flet...,0,0,0,0.5
1,"Yes, we’re talking about the social media star...",0,0,0,0.5
2,"On Valentine’s Day, he told her that he wants ...",0,3,2,0.0
3,But the question is who Ari’s child’s father is.,0,0,0,0.5
4,"Before we talk about that, let’s quickly talk ...",0,0,0,0.5
5,Ari Fletcher is not only a famous person on th...,0,0,1,1.0
6,"Ari was born in Chicago on July 12, 1995.",0,0,0,0.5
7,Did you know that she also runs her own business?,0,2,0,0.0
8,She is the owner of the hair product company .,0,1,0,0.0
9,"After that, Ari mostly posts videos and pictur...",0,1,0,0.0


In [None]:
#I think with the above solution we wouldn't need this function 
def count_words(text, word_list):
    return sum(text.count(word) for word in word_list)

sentences_df['male_count2'] = sentences_df['sentences'].apply(count_words, word_list=his_w)
sentences_df['female_count2'] = sentences_df['sentences'].apply(count_words, word_list=her_w)


In [None]:
pd.set_option('display.max_colwidth', 1000)
sentences_df.loc[[7]]

#there is a bug here. It weirdly seems to be double counting? The zip function is new to me though so maybe thats the problem. 

In [None]:
df = pd.DataFrame({'text': ['This is a sample text', 'Another text example', 'One more example']})

# define two lists of specific words to count
word_list1 = ['text', 'example']
word_list2 = ['is', 'more']

def count_words(text, word_list):
    return sum(text.count(word) for word in word_list)

# use apply() to add two new columns with the counts of the specific words in each list
df['word_count1'] = df['text'].apply(count_words, word_list=word_list1)
df['word_count2'] = df['text'].apply(count_words, word_list=word_list2)

# print the resulting DataFrame
print(df)

