In [11]:
import pandas as pd
import nltk
import re
import numpy as np
import time 
import pickle
#nltk.download('punkt')


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import classification_report
import warnings 
warnings.filterwarnings("ignore")

In [2]:
#import pre-processed data from pickle 
df_09= pd.read_pickle(r"/Users/yolandaferreirofranchi/Documents/GitHub/Masters-Thesis/sample_results_df_09.pickle")
df_09

Unnamed: 0,pre_processed_sent,male_count,female_count,apicall_fail,sentences,article_id,year,col_type
0,"[break, silence, surround, break, say, band, l...",3,0,0,Liam Gallagher has broken the silence surround...,5048,2009,0
1,"[however, interview, say, longer]",2,0,0,"However, in an interview with The Times Liam G...",5048,2009,0
9,"[leave, band, follow, bust, say, simply, could...",4,0,0,Noel Gallagher left the Manchester band follow...,5048,2009,0
10,"[launch, clothing, line, earlier, year, admit,...",3,0,0,"""Liam launched his clothing line Pretty Green ...",5048,2009,0
13,"[people, able, buy, record]",1,0,0,"""People will be able to buy his records.",5048,2009,0
...,...,...,...,...,...,...,...,...
9706,"[organisers, say, extend, programme, live, tou...",2,0,0,Organisers of the X Factor have said they've e...,1043733,2009,0
9707,"[vote, judge, week, seven, competition]",1,0,0,Jedward were voted off by the X Factor judges ...,1043733,2009,0
9708,"[two, month, tour, begin, see, extra, date, add]",3,0,0,"The two month tour, which begins in Liverpool ...",1043733,2009,0
9709,"[artists, confirm, tour, contestant]",2,1,0,Artists confirmed for the tour are contestants...,1043733,2009,0


**Define TFIDF Vectorizer**

In [3]:
#tfidf vectorizer
def fake(token):
    return token

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=fake,
    preprocessor=fake,
    token_pattern=None)  

**GridSearchCV to Find Best Hyperparameters**

In [None]:
# parameter grid
parameters = {"penalty": ['l1','l2'], 
              "C": np.logspace(-3,3,7),
              "solver": ['newton-cg', 'lbfgs', 'liblinear'],
}

#GridSearchCV
logreg = LogisticRegression()
clf_logreg = GridSearchCV(logreg, 
                          param_grid = parameters, 
                          scoring = "accuracy", 
                          cv = 10)

clf_logreg.fit(X_train, y_train)

**Logistic Regression Classifier**

*What the LR model does-* LR estimates the probability of an instance belonging to the positive class. 

In [28]:
def logistic_regression_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga']
    }

    #the classifier 
    clf = LogisticRegression()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_clf = LogisticRegression(**grid_search.best_params_)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Year: {year}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Classification Report:\n{class_report}")

    #coefficients
    coefs = best_clf.coef_[0]
    sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:100]
    low_coef = sorted_coef[-100:]
    
    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])


    #print probability results 
    feature_indices = {feature: idx for idx, feature in enumerate(tfidf.get_feature_names_out())}
    probas = best_clf.predict_proba(X_transformed)  # predict probabilities of positive class
    positive_probas = probas[:, 1]

    print(f"\nProbability of the Following Words Being Female:")
    feature_prob_dict = {}
    for feature, index in feature_indices.items():
        proba = (X_transformed[:, index].toarray() * positive_probas).mean()
        feature_prob_dict[feature] = proba
        print(f"{feature}: {proba}")
    
    #create a dataframe with the data: 
    #df_probs = pd.DataFrame.from_dict(feature_prob_dict, orient='index', columns=['probability'])
    
    #return df_probs #DF of probability for each word being female by year

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef

In [29]:
logistic_regression_year(df_09, 'col_type', 'pre_processed_sent')

Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.93      0.78       418
           1       0.66      0.25      0.36       248

    accuracy                           0.67       666
   macro avg       0.67      0.59      0.57       666
weighted avg       0.67      0.67      0.62       666


Probability of the Following Words Being Female:
abandon: 0.00030023206156630766
abc: 0.00011130085081894169
abdomen: 5.780215604835684e-05
abiding: 4.125007467491722e-05
ability: 6.199713582573354e-05
able: 0.0006405348291847932
aboard: 2.9720930865364296e-05
abound: 3.2451427839725194e-05
absence: 0.00017830982524215207
absolutely: 0.00027711892921491706
absoultely: 4.077463204136985e-05
abuse: 0.0005480620443304978
abusive: 7.690429911584644e-05
academy: 7.579289911966069e-05
accept: 0.00025598730691314377
acceptance: 3.172931111140035e-05
accepting: 4.289447014415984e-05
access: 0.0001602947857988206
accessible: 3.41411235

(     feature      coef
 0        say  2.705579
 1   newsbeat  1.619276
 2     really  1.550203
 3       time  1.426076
 4        add  1.366894
 ..       ...       ...
 95     found  0.630204
 96  mosshart  0.629733
 97      case  0.628906
 98     laura  0.627384
 99   reflect  0.626618
 
 [100 rows x 2 columns],
         feature      coef
 0           itv -0.545412
 1      upcoming -0.546671
 2           guy -0.547751
 3          also -0.551148
 4        driver -0.552802
 ..          ...       ...
 95  forthcoming -1.144666
 96      country -1.168016
 97         hold -1.217186
 98      include -1.386200
 99         band -1.414856
 
 [100 rows x 2 columns])

In [None]:
#call the function 
logistic_regression_year(df_09, 'col_type', 'pre_processed_sent')

**Coefficient Analysis** 

In [None]:
#call the function - with accuracy, classification, highest & lowest coeffs, df with word probs
df_coef_09 = logistic_regression_year(df_09, 'col_type', 'pre_processed_sent')
df_coef_09

In [None]:
#create DF of highest coef
highest_coef_09 = pd.DataFrame(df_coef_09[0]) 

#create DF of lowest lowest coef manipulation 
lowest_coef_09 = pd.DataFrame(df_coef_09[1]) 
lowest_coef_09 = lowest_coef.sort_values(by = ["coef"], ascending = True).reset_index(drop = True) #absolute lowest value 
lowest_coef_09

In [None]:
#save DF as pickle file per year 
lowest_coef_09.to_pickle('RESULTS09_coeflow_sample.pickle')
highest_coef_09.to_pickle('RESULTS09_coeflhigh_sample.pickle')
topwords_09.head(100)

**Interpretation of coefficient results**
"winner" is one of the independent variables in the model and its coefficient value is 1.2865737872946597. This means that a one unit increase in the value of the "winner" variable will increase the log-odds of the positive class (e.g. "female" if the logistic regression model is binary and predicting gender) by the corresponding coefficient value, while holding all other variables constant.

**Word Probability Analysis** 

#make sure to un-comment the last two lines of the functions

In [None]:
#call the function - with accuracy, classification, highest & lowest coeffs, df with word probs
df_probs_09 = logistic_regression_year(df_09, 'col_type', 'pre_processed_sent')
df_probs_09

In [None]:
topwords_09 = df_probs_09["probability"].sort_values(ascending= False)

#top 100 words with highest probability of belonging to the female class 
topwords_09 = pd.DataFrame(topwords_09) 
topwords_09.to_pickle('RESULTS09_sample.pickle')
topwords_09.head(100)

**Interpretation of the predicted probabilities in LR:** 
The predicted probabilities of the logistic regression model tell us the probability that the input data belongs to the positive class - in this case the female class as we attributed it a value = 1 in binary log reg. Hence, for each word, we get a list of a word/feature and the probability that it is female. 

These predicted probabilities can be interpreted as the confidence level of the model in its prediction. For example, a predicted probability of 0.8 for a positive class means that the model is 80% confident that the sample belongs to the positive class. 

**Support Vector Machine**

In [41]:
from sklearn.svm import SVC
def svm_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'C': [.1, 1, 10],
        'gamma': ['scale', 'auto'], 
        'degree': [2,3,4]
    }

    #the classifier 
    clf = SVC()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_clf = SVC(**grid_search.best_params_)
    best_clf.fit(X_train, y_train)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Year: {year}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{class_report}")

    #coefficients
    coefs = best_clf.coef_[0]
    sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    high_coef = sorted_coef[:100]
    low_coef = sorted_coef[-100:]
    
    df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])


    #print probability results 
    feature_indices = {feature: idx for idx, feature in enumerate(tfidf.get_feature_names_out())}
    probas = best_clf.predict_proba(X_transformed)  # predict probabilities of positive class
    positive_probas = probas[:, 1]

    print(f"\nProbability of the Following Words Being Female:")
    feature_prob_dict = {}
    for feature, index in feature_indices.items():
        proba = (X_transformed[:, index].toarray() * positive_probas).mean()
        feature_prob_dict[feature] = proba
        print(f"{feature}: {proba}")
    
    #create a dataframe with the data: 
    #df_probs = pd.DataFrame.from_dict(feature_prob_dict, orient='index', columns=['probability'])
    
    #return df_probs #DF of probability for each word being female by year

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef

In [45]:
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
def svm_year(df, target_col, text_col):
    #start timer 
    start_time = time.time()
    
    #split data 
    X = df[text_col].apply(lambda x: str(x))
    y = df[target_col]

    #train test split
    tfidf = TfidfVectorizer()
    X_transformed = tfidf.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    # define the hyperparameters to search over
    param_grid = {
        'C': [.1, 1, 10],
        'gamma': ['scale', 'auto'],
        #add kernel here too 
        # add further HPs  
        'degree': [2,3,4]
    }

    #the classifier 
    clf = SVC()

    #create a GridsearchCV object 
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_clf = SVC(**grid_search.best_params_)
    best_clf.fit(X_train, y_train)

    #run the classifier 
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)

    #performance 
    accuracy = best_clf.score(X_test, y_test) #evaluate on test set
    class_report = classification_report(y_test, y_pred, zero_division = 0)
    results = {'accuracy': accuracy, 'classification_report': class_report}
    #print(f"Year: {year}")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{class_report}")

    #coefficients
    result_clf = permutation_importance(best_clf, X_test, y_test, n_repeats=10, random_state=42)
    importance_scores = result_clf.importances_mean
    #importance_scores_df = pd.DataFrame(importance_scores, columns=['tbd'])

    for i in range(len(importance_scores)):
        print("Feature {}: Importance score = {:.3f}".format(i, importance_scores[i]))
    
    #coefs = best_clf.coef_[0]
    #sorted_coef = sorted((zip(tfidf.get_feature_names_out(), coefs)), key = lambda x: x[1], reverse=True)
    #high_coef = sorted_coef[:100]
    #low_coef = sorted_coef[-100:]
    
    #df_high_coef = pd.DataFrame(high_coef, columns=['feature', 'coef'])
    #df_low_coef = pd.DataFrame(low_coef, columns=['feature', 'coef'])


    #print probability results 
    feature_indices = {feature: idx for idx, feature in enumerate(tfidf.get_feature_names_out())}
    probas = best_clf.predict_proba(X_transformed)  # predict probabilities of positive class
    positive_probas = probas[:, 1]

    print(f"\nProbability of the Following Words Being Female:")
    feature_prob_dict = {}
    for feature, index in feature_indices.items():
        proba = (X_transformed[:, index].toarray() * positive_probas).mean()
        feature_prob_dict[feature] = proba
        print(f"{feature}: {proba}")
    
    #create a dataframe with the data: 
    #df_probs = pd.DataFrame.from_dict(feature_prob_dict, orient='index', columns=['probability'])
    
    #return df_probs #DF of probability for each word being female by year

    #end timer 
    end_time = time.time()
    print(f"\nExecution time: {end_time - start_time:.2f} seconds")

    return df_high_coef, df_low_coef

In [46]:
svm_year(df_09, 'col_type', 'pre_processed_sent')

Accuracy: 0.6951951951951952
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.88      0.78       418
           1       0.66      0.38      0.48       248

    accuracy                           0.70       666
   macro avg       0.68      0.63      0.63       666
weighted avg       0.69      0.70      0.67       666



TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.