In [2]:
import pandas as pd
import time 
import pickle
import os
#nltk.download('punkt')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import warnings 
warnings.filterwarnings("ignore")

In [2]:
year = "2010"
file_path = "/Users/yolandaferreirofranchi/Desktop/ThesisDatasets/"
file_path_2 = "_final_rnn.pickle"

df_10= pd.read_pickle(file_path + year + file_path_2)

In [3]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

In [4]:
#apply function to only get rows with an absolute count 
df_10['col_type'] = df_10.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)

#remove nulls 
df_10 = df_10[df_10["col_type"].notnull()]

#DOC: number of male and female columns
df_10["col_type"].value_counts()  

0.0    378476
1.0    113870
Name: col_type, dtype: int64

**Define TFIDF Vectorizer**

In [10]:
#tfidf vectorizer
def fake(token):
    return token

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  

**Logistic Regression Classifier**

*What the LR model does-* LR estimates the probability of an instance belonging to the positive class. 

In [12]:
def logistic_regression_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'sag' 'saga'], #removed liblinear as it is for small + medium datasets & NOT for sparse data
        'class_weight': ['balanced', {0: 0.3, 1: 0.7}],
        'random_state': [42]
    }

    #the classifier 
    clf = LogisticRegression()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_clf = LogisticRegression(**grid_search.best_params_)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    #results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Accuracy: {accuracy:.2f}")
    #print(f"Classification Report:\n{class_report}")

    #coefficients
    coefs = best_clf.coef_[0]
    sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:1000]
    low_coef = sorted_coef[-1000:]
    
    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])

    #save model 
    with open('results.pkl', 'wb') as f:
        pickle.dump({'model': best_clf, 'tfidf': tfidf, 'accuracy': accuracy, 'report': class_report}, f)

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef, best_clf, class_report #df_probs_top

**Coefficient Analysis**

In [7]:
df_probs_10 = logistic_regression_year(df_10, 'col_type', 'pre_processed_sent')
df_probs_10


Execution time: 525.98 seconds


(       feature      coef
 0     pregnant  3.186218
 1         baby  3.085429
 2        child  2.502925
 3    boyfriend  2.471323
 4          rap  2.215964
 ..         ...       ...
 995     openly  0.201524
 996     duress  0.201428
 997   campaign  0.201339
 998      nanny  0.200883
 999      susan  0.200859
 
 [1000 rows x 2 columns],
         feature      coef
 0     reinstate -0.213403
 1          bird -0.214428
 2        dinghy -0.214939
 3         evade -0.215387
 4       embassy -0.215427
 ..          ...       ...
 995    football -1.268172
 996  girlfriend -1.327853
 997       shoot -1.620356
 998     soldier -1.622398
 999      arrest -1.826027
 
 [1000 rows x 2 columns],
 LogisticRegression(C=0.1, class_weight={0: 0.3, 1: 0.7}, random_state=42,
                    solver='newton-cg'),
 '              precision    recall  f1-score   support\n\n         0.0       0.81      0.90      0.85     75430\n         1.0       0.48      0.30      0.37     23040\n\n    accuracy         

*Interpreting Performance*

The LR model is WAY better in terms of precision, recall, and f1-score at predicting the negative class - i.e. male. 

In [8]:
#open model performance metrics 
with open('results_10.pkl', 'rb') as f:
    results = pickle.load(f)

accuracy = results['accuracy']
report = results['report']

print(f"Accuracy: {accuracy:.2f}")
print(f"Classification report:\n{report}")

Accuracy: 0.76
Classification report:
              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85     75430
         1.0       0.48      0.30      0.37     23040

    accuracy                           0.76     98470
   macro avg       0.64      0.60      0.61     98470
weighted avg       0.73      0.76      0.74     98470



In [None]:
#create DF of highest coef
highest_coef_22 = pd.DataFrame(df_probs_22[0])
highest_coef_22["coef_type"] = "highest"
highest_coef_22["year"] = year

#create DF of lowest lowest coef manipulation 
lowest_coef_22 = pd.DataFrame(df_probs_22[1]) 
lowest_coef_22 = lowest_coef_22.sort_values(by = ["coef"], ascending = True).reset_index(drop = True) #absolute lowest value 
lowest_coef_22["coef_type"] = "lowest" #coef type
lowest_coef_22["year"] = year #year 
highest_coef_22

In [None]:
#save DF as pickle file per year 
lowest_coef_22.to_pickle('RESULTS22_coef_low.pickle')
highest_coef_22.to_pickle('RESULTS22_coef_high.pickle')

**Interpretation of coefficient results**
"winner" is one of the independent variables in the model and its coefficient value is 1.2865737872946597. This means that a one unit increase in the value of the "winner" variable will increase the log-odds of the positive class (e.g. "female" if the logistic regression model is binary and predicting gender) by the corresponding coefficient value, while holding all other variables constant.

**Interpretation of the predicted probabilities in LR:** 
The predicted probabilities of the logistic regression model tell us the probability that the input data belongs to the positive class - in this case the female class as we attributed it a value = 1 in binary log reg. Hence, for each word, we get a list of a word/feature and the probability that it is female. 

These predicted probabilities can be interpreted as the confidence level of the model in its prediction. For example, a predicted probability of 0.8 for a positive class means that the model is 80% confident that the sample belongs to the positive class. 

In [None]:
def load_pickle_files_low(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("low.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

def load_pickle_files_high(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("high.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

In [None]:
#stack dfs together for low and high coefs 
df_low = load_pickle_files_low(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high = load_pickle_files_high(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high

**Decade Long LR Model**

In [3]:
file_path = "/Users/yolandaferreirofranchi/Desktop/ThesisDatasets/"
file_path_2 = "_final_rnn.pickle"

df_10= pd.read_pickle(file_path + "2010" + file_path_2)
df_11= pd.read_pickle(file_path + "2011" + file_path_2)
df_12= pd.read_pickle(file_path + "2012" + file_path_2)
df_13= pd.read_pickle(file_path + "2013" + file_path_2)
df_14= pd.read_pickle(file_path + "2014" + file_path_2)
df_15= pd.read_pickle(file_path + "2015" + file_path_2)
df_16= pd.read_pickle(file_path + "2016" + file_path_2)
df_17= pd.read_pickle(file_path + "2017" + file_path_2)
df_18= pd.read_pickle(file_path + "2018" + file_path_2)
df_19= pd.read_pickle(file_path + "2019" + file_path_2)
df_20= pd.read_pickle(file_path + "2020" + file_path_2)
df_21= pd.read_pickle(file_path + "2021" + file_path_2)
df_22= pd.read_pickle(file_path + "2022" + file_path_2)

In [19]:
df_all = pd.concat([df_10, df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20, df_21, df_22])

In [20]:
df_all

Unnamed: 0,pre_processed_sent,string_rnn,male_count,female_count,Proper_noun_list,pn exists,sentences,article_id,year,col_type
0,"[say, delight, restored, bridge, back, use]",say delight restored bridge back use,1,0,[Southease],,"Chairman of Southease Parish, Neville Harrison...",1,2010,0
1,"[family, year, old, kill, house, fire, pay, tr...",family year old kill house fire pay tribute br...,0,2,[],,The family of a 34-year-old mother from Bristo...,21,2010,1
2,"[family, say, kind, totally, dedicated]",family say kind totally dedicated,1,4,[],,Her family said she was kind and a totally ded...,21,2010,1
3,"[truly, tragic, love, family, everything, give...",truly tragic love family everything give famil...,0,4,[Sara],,"'Truly tragic'""Sara loved her family above eve...",21,2010,1
4,"[everybody, know, love, miss, always]",everybody know love miss always,0,3,[],,"""Everybody who knew her will love her and miss...",21,2010,1
...,...,...,...,...,...,...,...,...,...,...
410780,"[ground, hard, thaw, yet, come, experience, sh...",ground hard thaw yet come experience show u th...,1,0,[],,"""The ground has been very hard, the thaw is ye...",1043999,2022,0
410781,"[endeavour, get, ahead, game, best, find, situ...",endeavour get ahead game best find situation c...,1,0,[],,"""What we are endeavouring to do here is get ah...",1043999,2022,0
410782,"[die, hit, garage, forecourt]",die hit garage forecourt,1,0,[],,A man has died after being hit by a 4x4 on a g...,1044006,2022,0
410783,"[north, say, incident, garage, happen, area, t...",north say incident garage happen area take hos...,2,0,"[Wales, Chester]",True,North Wales Police said the incident at the Pr...,1044006,2022,0


In [6]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

In [7]:
#apply function to only get rows with an absolute count 
df_all['col_type'] = df_all.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)

#remove nulls 
df_all = df_all[df_all["col_type"].notnull()]

#DOC: number of male and female columns
df_all["col_type"].value_counts()  

0.0    6928463
1.0    2736784
Name: col_type, dtype: int64

In [8]:
df_all

Unnamed: 0,pre_processed_sent,string_rnn,male_count,female_count,Proper_noun_list,pn exists,sentences,article_id,year,col_type
0,"[say, delight, restored, bridge, back, use]",say delight restored bridge back use,1,0,[Southease],,"Chairman of Southease Parish, Neville Harrison...",1,2010,0.0
1,"[family, year, old, kill, house, fire, pay, tr...",family year old kill house fire pay tribute br...,0,2,[],,The family of a 34-year-old mother from Bristo...,21,2010,1.0
3,"[truly, tragic, love, family, everything, give...",truly tragic love family everything give famil...,0,4,[Sara],,"'Truly tragic'""Sara loved her family above eve...",21,2010,1.0
4,"[everybody, know, love, miss, always]",everybody know love miss always,0,3,[],,"""Everybody who knew her will love her and miss...",21,2010,1.0
5,"[truly, tragic, event, keen, determine, exactl...",truly tragic event keen determine exactly lead...,0,1,[],,"""This was a truly tragic event and we are very...",21,2010,1.0
...,...,...,...,...,...,...,...,...,...,...
410780,"[ground, hard, thaw, yet, come, experience, sh...",ground hard thaw yet come experience show u th...,1,0,[],,"""The ground has been very hard, the thaw is ye...",1043999,2022,0.0
410781,"[endeavour, get, ahead, game, best, find, situ...",endeavour get ahead game best find situation c...,1,0,[],,"""What we are endeavouring to do here is get ah...",1043999,2022,0.0
410782,"[die, hit, garage, forecourt]",die hit garage forecourt,1,0,[],,A man has died after being hit by a 4x4 on a g...,1044006,2022,0.0
410783,"[north, say, incident, garage, happen, area, t...",north say incident garage happen area take hos...,2,0,"[Wales, Chester]",True,North Wales Police said the incident at the Pr...,1044006,2022,0.0


In [9]:
df_reduced = df_all[["pre_processed_sent", "col_type", "article_id", "year"]]
df_reduced

Unnamed: 0,pre_processed_sent,col_type,article_id,year
0,"[say, delight, restored, bridge, back, use]",0.0,1,2010
1,"[family, year, old, kill, house, fire, pay, tr...",1.0,21,2010
3,"[truly, tragic, love, family, everything, give...",1.0,21,2010
4,"[everybody, know, love, miss, always]",1.0,21,2010
5,"[truly, tragic, event, keen, determine, exactl...",1.0,21,2010
...,...,...,...,...
410780,"[ground, hard, thaw, yet, come, experience, sh...",0.0,1043999,2022
410781,"[endeavour, get, ahead, game, best, find, situ...",0.0,1043999,2022
410782,"[die, hit, garage, forecourt]",0.0,1044006,2022
410783,"[north, say, incident, garage, happen, area, t...",0.0,1044006,2022


In [13]:
df_probs_all= logistic_regression_year(df_reduced, 'col_type', 'pre_processed_sent')
df_probs_all


Execution time: 33025.36 seconds


(       feature       coef
 0      harrier  10.251280
 1     pregnant   8.474120
 2    headscarf   8.387455
 3        hijab   8.165802
 4       chibok   7.926305
 ..         ...        ...
 995    sheezus   3.767088
 996   chippies   3.767081
 997   thunberg   3.766711
 998   tlatlaya   3.766054
 999  roundmoor   3.765732
 
 [1000 rows x 2 columns],
         feature      coef
 0          joao -3.415070
 1    decolonise -3.417159
 2         bosco -3.417262
 3         tupac -3.417617
 4         dally -3.418468
 ..          ...       ...
 995     batsman -7.056664
 996  crossbench -7.877373
 997    gatherer -7.928533
 998    prostate -8.027753
 999  fatherhood -8.211420
 
 [1000 rows x 2 columns],
 LogisticRegression(C=10, class_weight={0: 0.3, 1: 0.7}, random_state=42,
                    solver='newton-cg'),
 '              precision    recall  f1-score   support\n\n         0.0       0.82      0.71      0.76   1385268\n         1.0       0.45      0.60      0.51    547782\n\n    accura

**Decade Model Performance**

In [14]:
#open model performance metrics 
with open('results_decade.pkl', 'rb') as f:
    results = pickle.load(f)

accuracy = results['accuracy']
report = results['report']

print(f"Accuracy: {accuracy:.2f}")
print(f"Classification report:\n{report}")

Accuracy: 0.68
Classification report:
              precision    recall  f1-score   support

         0.0       0.82      0.71      0.76   1385268
         1.0       0.45      0.60      0.51    547782

    accuracy                           0.68   1933050
   macro avg       0.63      0.66      0.64   1933050
weighted avg       0.71      0.68      0.69   1933050



In [17]:
#create DF of highest coef
highest_coef_decade = pd.DataFrame(df_probs_all[0])
highest_coef_decade["coef_type"] = "highest"
#highest_coef_decade["year"] = year

#create DF of lowest lowest coef manipulation 
lowest_coef_decade = pd.DataFrame(df_probs_all[1]) 
lowest_coef_decade = lowest_coef_decade.sort_values(by = ["coef"], ascending = True).reset_index(drop = True) #absolute lowest value 
lowest_coef_decade["coef_type"] = "lowest" #coef type
#lowest_coef_decade["year"] = year #year 
lowest_coef_decade

Unnamed: 0,feature,coef,coef_type
0,fatherhood,-8.211420,lowest
1,prostate,-8.027753,lowest
2,gatherer,-7.928533,lowest
3,crossbench,-7.877373,lowest
4,batsman,-7.056664,lowest
...,...,...,...
995,dally,-3.418468,lowest
996,tupac,-3.417617,lowest
997,bosco,-3.417262,lowest
998,decolonise,-3.417159,lowest


In [18]:
#save DF as pickle file per year 
lowest_coef_decade.to_pickle('RESULTSdecade_coef_low.pickle')
highest_coef_decade.to_pickle('RESULTSdecade_coef_high.pickle')