In [23]:
import pandas as pd
import time 
import pickle
import os
#nltk.download('punkt')

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import warnings 
warnings.filterwarnings("ignore")

In [24]:
# Years to model, as strings so they can be concatenated straight into file names.
year_list = [str(year) for year in range(2012, 2023)]

# Folder the per-year input pickles are read from (trailing slash required: paths are built by concatenation).
file_path = "C:/Users/danie/Desktop/Masters Thesis/New Clean Data for Log Reg/"

# Folder every model / result / coefficient pickle is written to (trailing slash required).
save_path = "C:/Users/danie/Desktop/Masters Thesis/Log Reg Results/"

In [25]:
def absolute_count(male_col, female_col):
    """Label a row by which of the two counts is the only non-zero one.

    Returns 1 when ``female_col`` exceeds ``male_col`` and ``male_col`` is 0,
    0 when ``male_col`` exceeds ``female_col`` and ``female_col`` is 0,
    and ``None`` in every other case (both zero, both non-zero, or not comparable as such).
    """
    female_only = female_col > male_col and male_col == 0
    if female_only:
        return 1

    male_only = male_col > female_col and female_col == 0
    if male_only:
        return 0

    # Mixed or empty rows carry no unambiguous label.
    return None

In [26]:
# Identity helper passed to the TfidfVectorizer below as both its tokenizer and its preprocessor.
def fake(token):
    """Return ``token`` unchanged."""
    return token

# Module-level vectorizer whose preprocessing and tokenizing hooks are both the identity
# function `fake` — presumably meant for documents that are already lists of tokens (TODO confirm).
# NOTE(review): logistic_regression_year assigns its own local `tfidf = TfidfVectorizer()`,
# so no code visible in this notebook uses this instance — confirm which one is intended.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  

In [27]:
def logistic_regression_year(year, df, target_col, text_col):
    """Tune, fit, evaluate and persist a TF-IDF + logistic-regression model for one year.

    The text column is vectorized with a default ``TfidfVectorizer``, a grid search
    (5-fold CV, F1 scoring) picks the logistic-regression hyperparameters on the
    training split, and the refitted best model is scored on the held-out 20 %.

    Parameters
    ----------
    year : str
        Concatenated into the output file name ``save_path + year + '_results.pkl'``.
    df : pandas.DataFrame
        Frame containing ``text_col`` and ``target_col``.
    target_col : str
        Name of the column holding the class labels.
    text_col : str
        Name of the column holding the documents; every value is passed through ``str``.

    Returns
    -------
    tuple
        ``(df_high_coef, df_low_coef, best_clf, class_report)`` where the two frames
        (columns ``feature``, ``coef``) hold up to 1000 features with the largest and
        smallest coefficients, ``best_clf`` is the fitted classifier and
        ``class_report`` is the text classification report on the test split.

    Side effects
    ------------
    Pickles ``{'model', 'tfidf', 'accuracy', 'report'}`` to
    ``save_path + year + '_results.pkl'`` (``save_path`` is a module-level global)
    and prints the elapsed time.
    """
    #start timer
    start_time = time.time()

    #split data
    X = df[text_col].apply(str)
    y = df[target_col]

    # Split the raw text BEFORE fitting the vectorizer: fitting TF-IDF on all rows lets
    # the test documents shape the vocabulary and IDF weights (data leakage).
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # NOTE(review): a pre-tokenized `tfidf` is configured at module level, but this
    # function has always built its own default vectorizer — confirm which is intended.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train_text)
    X_test = tfidf.transform(X_test_text)

    # define the hyperparameters to search over
    shared_grid = {
        'C': [0.1, 1, 10, 100],
        'class_weight': ['balanced', {0: 0.3, 1: 0.7}],
        'random_state': [42],
    }
    # Only solver/penalty pairs that LogisticRegression supports are listed: of these
    # solvers just 'saga' handles L1. The original grid crossed both penalties with
    # every solver and, through a missing comma, searched the non-existent solver
    # 'sagsaga'; those fits failed and the failures were hidden by the warning filter.
    # liblinear is left out as it is for small + medium datasets & NOT for sparse data.
    param_grid = [
        {**shared_grid, 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', 'sag', 'saga']},
        {**shared_grid, 'penalty': ['l1'], 'solver': ['saga']},
    ]

    #the classifier
    clf = LogisticRegression()

    #create a GridsearchCV object
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train)

    # GridSearchCV (refit=True by default) has already refitted the best configuration
    # on the full training split, so a second manual fit is unnecessary.
    best_clf = grid_search.best_estimator_
    y_pred = best_clf.predict(X_test)

    #performance
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)

    #coefficients, sorted from most positive to most negative
    coefs = best_clf.coef_[0]
    sorted_coef = sorted(zip(tfidf.get_feature_names_out(), coefs), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:1000]
    low_coef = sorted_coef[-1000:]

    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])

    #save model
    with open(save_path + year + '_results.pkl', 'wb') as f:
        pickle.dump({'model': best_clf, 'tfidf': tfidf, 'accuracy': accuracy, 'report': class_report}, f)

    #end timer
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef, best_clf, class_report

In [28]:
# Load the 2012 frame into a module-level `df`.
# NOTE(review): yearly_log_reg reads every year's pickle into its own local `df`, so no
# later cell visible here uses this variable — looks like a leftover sanity check; confirm.
df= pd.read_pickle(r"C:\Users\danie\Desktop\Masters Thesis\New Clean Data for Log Reg\2012_final_rnn.pickle")

In [30]:
def yearly_log_reg(year_list):
    """Run the per-year logistic-regression pipeline for each entry of ``year_list``.

    For every year the function:
      1. reads ``file_path + year + "_final_rnn.pickle"``;
      2. labels each row with ``absolute_count`` and drops rows left unlabelled;
      3. calls ``logistic_regression_year`` and pickles its returned tuple to
         ``save_path + year + '_logreg_model.pkl'``;
      4. reloads ``save_path + year + '_results.pkl'`` to print accuracy and report;
      5. pickles the highest/lowest coefficient tables (tagged with ``coef_type``
         and ``year``) to ``save_path + year + "highest_coef"`` / ``"lowest_coef"``.

    Nothing is returned; all results go to disk and stdout.
    """
    for year in year_list:
        print(year, "model:")
        year_df = pd.read_pickle(file_path + year + "_final_rnn.pickle")

        print("Size before removal", len(year_df.index))

        # Keep only rows where exactly one of the two counts is non-zero.
        year_df['col_type'] = year_df.apply(
            lambda row: absolute_count(row['male_count'], row['female_count']),
            axis=1,
        )
        has_label = year_df["col_type"].notnull()
        labelled_df = year_df[has_label]

        print("Size after removal", len(labelled_df.index))

        print( "class distribution", labelled_df["col_type"].value_counts())

        model_outputs = logistic_regression_year(year, labelled_df, 'col_type', 'pre_processed_sent')

        # Persist the whole returned tuple (coefficient frames, classifier, report).
        with open(save_path + year + '_logreg_model.pkl', 'wb') as handle:
            pickle.dump(model_outputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Accuracy is not part of the returned tuple, so read it back from the
        # results file written by logistic_regression_year.
        with open(save_path + year + '_results.pkl', 'rb') as f:
            saved = pickle.load(f)

        accuracy, report = saved['accuracy'], saved['report']
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Classification report:\n{report}")

        top_coefs = pd.DataFrame(model_outputs[0]).assign(coef_type="highest", year=year)
        top_coefs.to_pickle(save_path + year + "highest_coef")

        # Re-sort so the most negative coefficient comes first.
        bottom_coefs = (
            pd.DataFrame(model_outputs[1])
            .sort_values(by=["coef"], ascending=True)
            .reset_index(drop=True)
            .assign(coef_type="lowest", year=year)
        )
        bottom_coefs.to_pickle(save_path + year + "lowest_coef")






In [31]:
# Run the pipeline for every year in year_list; models, reports and coefficient tables
# are written under save_path and progress is printed per year.
yearly_log_reg(year_list)

2012 model:
Size before removal 911347
Size after removal 874527
class distribution 0.0    653092
1.0    221435
Name: col_type, dtype: int64
