In [1]:
import pandas as pd
import time 
import pickle
import os
#nltk.download('punkt')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import warnings 
warnings.filterwarnings("ignore")

In [4]:
year = "2019"
file_path = "/Users/yolandaferreirofranchi/Desktop/ThesisDatasets/"
file_path_2 = "_final_rnn.pickle"

df_19= pd.read_pickle(file_path + year + file_path_2)
df_19

Unnamed: 0,pre_processed_sent,string_rnn,male_count,female_count,Proper_noun_list,pn exists,sentences,article_id,year,col_type
0,"[offer, help, university, budget, help, find, ...",offer help university budget help find per stu...,1,0,[],,He offered to help both universities with thei...,8,2019,0
1,"[die, knock, car, name, police]",die knock car name police,1,0,[],,A man who died after being knocked down by a c...,43,2019,0
2,"[three, age, arrest, suspicion, murder, releas...",three age arrest suspicion murder release bail,1,0,[Harlow],,"Three men from Harlow, aged 35, 34 and 25, hav...",43,2019,0
3,"[touch, plane, one, land, airbase, refuel, stop]",touch plane one land airbase refuel stop,1,0,"[Donald, Trump]",True,Donald Trump touched down in the UK when his p...,51,2019,0
4,"[post, say, refuel, stop]",post say refuel stop,0,1,[],,"In her post, Ms Sanders said: ""Got off AF1 for...",51,2019,1
...,...,...,...,...,...,...,...,...,...,...
817190,"[sat, bed, north, indie, rock, frontman, refle...",sat bed north indie rock frontman reflect busi...,1,0,[],,"Sat on his bed in north London, the indie-rock...",2175805,2019,0
817191,"[idea, go, hard, time, two, year, sigh]",idea go hard time two year sigh,1,0,[],,"""But I had no idea I was going to have such a ...",2175805,2019,0
817192,"[think, idea, building, show, around, audience...",think idea building show around audience phone...,0,1,[],,"""She thinks the idea of building a show around...",2175906,2019,1
817193,"[friend, like, always, film, everything, gig, ...",friend like always film everything gig say imp...,0,1,[],,"""Her friend Liam, who likes to ""always film ev...",2175906,2019,1


In [5]:
def absolute_count(male_col, female_col):
    if female_col > male_col and male_col == 0:
        return 1
    elif male_col> female_col and female_col ==0: 
        return 0
    else: 
        return None

In [6]:
#apply function to only get rows with an absolute count 
df_19['col_type'] = df_19.apply(lambda row: absolute_count(row['male_count'], row['female_count']),axis=1)

#remove nulls 
df_19 = df_19[df_19["col_type"].notnull()]

#DOC: number of male and female columns
df_19["col_type"].value_counts()  

0.0    522364
1.0    254067
Name: col_type, dtype: int64

**Define TFIDF Vectorizer**

In [7]:
#tfidf vectorizer
def fake(token):
    return token

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  

**Logistic Regression Classifier**

*What the LR model does-* LR estimates the probability of an instance belonging to the positive class. 

In [8]:
def logistic_regression_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'sag' 'saga'], #removed liblinear as it is for small + medium datasets & NOT for sparse data
        'class_weight': ['balanced', {0: 0.3, 1: 0.7}],
        'random_state': [42]
    }

    #the classifier 
    clf = LogisticRegression()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_clf = LogisticRegression(**grid_search.best_params_)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    #results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Accuracy: {accuracy:.2f}")
    #print(f"Classification Report:\n{class_report}")

    #coefficients
    coefs = best_clf.coef_[0]
    sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:1000]
    low_coef = sorted_coef[-1000:]
    
    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])

    #save model 
    with open('results.pkl', 'wb') as f:
        pickle.dump({'model': best_clf, 'tfidf': tfidf, 'accuracy': accuracy, 'report': class_report}, f)

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef, best_clf, class_report #df_probs_top

**Coefficient Analysis**

In [9]:
df_probs_19 = logistic_regression_year(df_19, 'col_type', 'pre_processed_sent')
df_probs_19


Execution time: 882.50 seconds


(        feature      coef
 0    motherhood  8.534818
 1     headscarf  8.127457
 2      pregnant  8.106356
 3     pregnancy  7.950200
 4     menopause  7.913446
 ..          ...       ...
 995       girly  4.061953
 996         eds  4.061669
 997    mbandaka  4.061367
 998    maitland  4.061349
 999        macy  4.061340
 
 [1000 rows x 2 columns],
              feature       coef
 0        unforgiving  -3.911172
 1    cardiopulmonary  -3.912530
 2            shimbun  -3.912889
 3      dispassionate  -3.914104
 4           abundant  -3.914345
 ..               ...        ...
 995             nate  -7.247050
 996           willie  -7.446785
 997             pell  -7.645863
 998       takeoverif  -8.232578
 999         prostate -10.090317
 
 [1000 rows x 2 columns],
 LogisticRegression(C=10, class_weight='balanced', random_state=42,
                    solver='newton-cg'),
 '              precision    recall  f1-score   support\n\n         0.0       0.81      0.69      0.75    104377\n 

*Interpreting Performance*

The LR model is WAY better in terms of precision, recall, and f1-score at predicting the negative class - i.e. male. 

In [11]:
#open model performance metrics 
with open('results_22.pkl', 'rb') as f:
    results = pickle.load(f)

accuracy = results['accuracy']
report = results['report']

print(f"Accuracy: {accuracy:.2f}")
print(f"Classification report:\n{report}")

Accuracy: 0.64
Classification report:
              precision    recall  f1-score   support

         0.0       0.76      0.65      0.70     49956
         1.0       0.50      0.63      0.55     27769

    accuracy                           0.64     77725
   macro avg       0.63      0.64      0.63     77725
weighted avg       0.66      0.64      0.65     77725



In [None]:
#create DF of highest coef
highest_coef_22 = pd.DataFrame(df_probs_22[0])
highest_coef_22["coef_type"] = "highest"
highest_coef_22["year"] = year

#create DF of lowest lowest coef manipulation 
lowest_coef_22 = pd.DataFrame(df_probs_22[1]) 
lowest_coef_22 = lowest_coef_22.sort_values(by = ["coef"], ascending = True).reset_index(drop = True) #absolute lowest value 
lowest_coef_22["coef_type"] = "lowest" #coef type
lowest_coef_22["year"] = year #year 
highest_coef_22

In [None]:
#save DF as pickle file per year 
lowest_coef_22.to_pickle('RESULTS22_coef_low.pickle')
highest_coef_22.to_pickle('RESULTS22_coef_high.pickle')

**Interpretation of coefficient results**
"winner" is one of the independent variables in the model and its coefficient value is 1.2865737872946597. This means that a one unit increase in the value of the "winner" variable will increase the log-odds of the positive class (e.g. "female" if the logistic regression model is binary and predicting gender) by the corresponding coefficient value, while holding all other variables constant.

**Interpretation of the predicted probabilities in LR:** 
The predicted probabilities of the logistic regression model tell us the probability that the input data belongs to the positive class - in this case the female class as we attributed it a value = 1 in binary log reg. Hence, for each word, we get a list of a word/feature and the probability that it is female. 

These predicted probabilities can be interpreted as the confidence level of the model in its prediction. For example, a predicted probability of 0.8 for a positive class means that the model is 80% confident that the sample belongs to the positive class. 

In [None]:
def load_pickle_files_low(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("low.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

def load_pickle_files_high(directory):
    objects = []
    for filename in os.listdir(directory):
        if filename.endswith("high.pickle"):
            with open(os.path.join(directory, filename), 'rb') as file:
                obj = pickle.load(file)
                objects.append(obj)
    return objects

In [None]:
#stack dfs together for low and high coefs 
df_low = load_pickle_files_low(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high = load_pickle_files_high(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis")
df_high