In [1]:
import pandas as pd
import time 
import pickle
import os
#nltk.download('punkt')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import warnings 
warnings.filterwarnings("ignore")

In [2]:
year = "2022"
file_path = "/Users/yolandaferreirofranchi/Desktop/ThesisDatasets/"
file_path_2 = "_final_rnn.pickle"

df_22= pd.read_pickle(file_path + year + file_path_2)
df_22

Unnamed: 0,pre_processed_sent,string_rnn,male_count,female_count,Proper_noun_list,pn exists,sentences,article_id,year,col_type
0,"[add, three, decade, building, extremely, prou...",add three decade building extremely proud grou...,1,0,"[Balhousie, Care]",True,"He added: ""After three decades of building Bal...",13,2022,0
1,"[hospitalise, number, facial, fracture, follow...",hospitalise number facial fracture follow seri...,2,0,[],,A man in his 20s has been hospitalised with a ...,24,2022,0
2,"[victim, front, passenger, seat, vehicle, atta...",victim front passenger seat vehicle attack ano...,5,0,[],,The victim was in the front passenger seat of ...,24,2022,0
3,"[involve, bitter, dispute, organisation, membe...",involve bitter dispute organisation member sta...,0,1,[Baroness],,"Baroness Scotland, Secretary General of the Co...",30,2022,1
4,"[add, chair, regret, challenge, position, take...",add chair regret challenge position take secre...,1,0,[],,"""He adds: ""The chair regrets and challenges th...",30,2022,0
...,...,...,...,...,...,...,...,...,...,...
410780,"[ground, hard, thaw, yet, come, experience, sh...",ground hard thaw yet come experience show u th...,1,0,[],,"""The ground has been very hard, the thaw is ye...",1043999,2022,0
410781,"[endeavour, get, ahead, game, best, find, situ...",endeavour get ahead game best find situation c...,1,0,[],,"""What we are endeavouring to do here is get ah...",1043999,2022,0
410782,"[die, hit, garage, forecourt]",die hit garage forecourt,1,0,[],,A man has died after being hit by a 4x4 on a g...,1044006,2022,0
410783,"[north, say, incident, garage, happen, area, t...",north say incident garage happen area take hos...,2,0,"[Wales, Chester]",True,North Wales Police said the incident at the Pr...,1044006,2022,0


In [3]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

In [4]:
#apply function to only get rows with an absolute count 
df_22['col_type'] = df_22.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)

#remove nulls 
df_22 = df_22[df_22["col_type"].notnull()]

#DOC: number of male and female columns
df_22["col_type"].value_counts()  

0.0    250053
1.0    138572
Name: col_type, dtype: int64

**Define TFIDF Vectorizer**

In [5]:
#tfidf vectorizer
def fake(token):
    return token

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  

**Logistic Regression Classifier**

*What the LR model does-* LR estimates the probability of an instance belonging to the positive class. 

In [15]:
def logistic_regression_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'sag' 'saga'], #removed liblinear as it is for small + medium datasets & NOT for sparse data
        'class_weight': ['balanced', {0: 0.3, 1: 0.7}],
        'random_state': [42]
    }

    #the classifier 
    clf = LogisticRegression()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_clf = LogisticRegression(**grid_search.best_params_)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    #results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Accuracy: {accuracy:.2f}")
    #print(f"Classification Report:\n{class_report}")

    #coefficients
    coefs = best_clf.coef_[0]
    sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:1000]
    low_coef = sorted_coef[-1000:]
    
    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])

    #save model 
    with open('results.pkl', 'wb') as f:
        pickle.dump({'model': best_clf, 'tfidf': tfidf, 'accuracy': accuracy, 'report': class_report}, f)

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef, best_clf, class_report #df_probs_top

**Coefficient Analysis**

In [16]:
df_probs_22 = logistic_regression_year(df_22, 'col_type', 'pre_processed_sent')
df_probs_22


Execution time: 327.76 seconds


(       feature      coef
 0     pregnant  6.768437
 1    pregnancy  5.119749
 2        hijab  4.834458
 3    boyfriend  4.825778
 4    maternity  4.799544
 ..         ...       ...
 995   royalist  1.034272
 996     detest  1.034228
 997    residue  1.033310
 998      hyper  1.033004
 999     jemima  1.032871
 
 [1000 rows x 2 columns],
       feature      coef
 0        boar -1.033652
 1     graphic -1.033753
 2         toâ -1.034010
 3    pipeline -1.034017
 4      connor -1.034986
 ..        ...       ...
 995       gay -2.647956
 996    arrest -2.734533
 997   cocaine -3.258467
 998       iii -3.583065
 999  prostate -3.645832
 
 [1000 rows x 2 columns],
 LogisticRegression(C=1, class_weight='balanced', random_state=42,
                    solver='newton-cg'),
 '              precision    recall  f1-score   support\n\n         0.0       0.76      0.65      0.70     49956\n         1.0       0.50      0.63      0.55     27769\n\n    accuracy                           0.64     77725

*Interpreting Performance*

The LR model is WAY better in terms of precision, recall, and f1-score at predicting the negative class - i.e. male. 

In [17]:
#open model performance metrics 
with open('results_22.pkl', 'rb') as f:
    results = pickle.load(f)

accuracy = results['accuracy']
report = results['report']

print(f"Accuracy: {accuracy:.2f}")
print(f"Classification report:\n{report}")

Accuracy: 0.64
Classification report:
              precision    recall  f1-score   support

         0.0       0.76      0.65      0.70     49956
         1.0       0.50      0.63      0.55     27769

    accuracy                           0.64     77725
   macro avg       0.63      0.64      0.63     77725
weighted avg       0.66      0.64      0.65     77725



In [None]:
#create DF of highest coef
highest_coef_22 = pd.DataFrame(df_probs_22[0])
highest_coef_22["coef_type"] = "highest"
highest_coef_22["year"] = year

#create DF of lowest lowest coef manipulation 
lowest_coef_22 = pd.DataFrame(df_probs_22[1]) 
lowest_coef_22 = lowest_coef_22.sort_values(by = ["coef"], ascending = True).reset_index(drop = True) #absolute lowest value 
lowest_coef_22["coef_type"] = "lowest" #coef type
lowest_coef_22["year"] = year #year 
highest_coef_22

In [None]:
#save DF as pickle file per year 
lowest_coef_22.to_pickle('RESULTS22_coef_low.pickle')
highest_coef_22.to_pickle('RESULTS22_coef_high.pickle')

**Interpretation of coefficient results**
"winner" is one of the independent variables in the model and its coefficient value is 1.2865737872946597. This means that a one unit increase in the value of the "winner" variable will increase the log-odds of the positive class (e.g. "female" if the logistic regression model is binary and predicting gender) by the corresponding coefficient value, while holding all other variables constant.

**Interpretation of the predicted probabilities in LR:** 
The predicted probabilities of the logistic regression model tell us the probability that the input data belongs to the positive class - in this case the female class as we attributed it a value = 1 in binary log reg. Hence, for each word, we get a list of a word/feature and the probability that it is female. 

These predicted probabilities can be interpreted as the confidence level of the model in its prediction. For example, a predicted probability of 0.8 for a positive class means that the model is 80% confident that the sample belongs to the positive class. 

In [None]:
def load_pickle_files_low(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("low.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

def load_pickle_files_high(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("high.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

In [None]:
#stack dfs together for low and high coefs 
df_low = load_pickle_files_low(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high = load_pickle_files_high(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high