In [1]:
from methods_classification import *

This notebook contains:
1. Preparation of data structure for classification of news images
2. Model implementation - the model with the best test results for the current problem was chosen, i.e. a Support Vector Machine with an RBF kernel
3. Copying of the images into folders named after their classification (i.e., if the embedding of news image xx.jpg was labelled 'Politician A', then the image xx.jpg is written to that respective folder)

## 1. Data infrastructure

In [2]:
# Labelled embeddings (politicians and unknowns) that the classifier is fitted on
embeddings = pd.read_parquet(path='datasets/embeddings/embedded_politicians_pca_tsne.parquet')
# Embeddings of the cut-out NOS and NU news images that still need a label
nos_nu_embeddings = pd.read_parquet(path='datasets/embeddings/embedded_news_images_nos_nu.parquet')

In [4]:
# Count the rows per news outlet up front: re-using the enclosing quote type inside an
# f-string replacement field is only valid syntax from Python 3.12 (PEP 701) onwards,
# so the original one-liner is a SyntaxError on older interpreters.
news_outlet = nos_nu_embeddings['dataset']
n_nos = int((news_outlet == 'NOS').sum())
n_nu = int((news_outlet == 'NU').sum())

# The printed text is unchanged; the literals are only split per output line.
print(
    'The train/test dataset containing unique embeddings\n'
    'of both politicians and unknowns\n'
    f'is of shape {embeddings.shape}\n'
    'and contains the columns:\n'
    f'{embeddings.columns.tolist()}\n\n'
)
print(
    'The cut-out news images dataset containing unique embeddings\n'
    'of both politicians and unknowns\n'
    f'is of shape {nos_nu_embeddings.shape} with both NOS (N={n_nos}) and NU (N={n_nu}) images\n'
    'and contains the columns:\n'
    f'{nos_nu_embeddings.columns.tolist()}'
)

The train/test dataset containing unique embeddings
of both politicians and unknowns
is of shape (4287, 6)
and contains the columns:
['embedding', 'dataset', 'image_filename', 'image_path', 'PCA_64D', 'tSNE_2D']


The cut-out news images dataset containing unique embeddings
of both politicians and unknowns
is of shape (11343, 4) with both NOS (N=5627) and NU (N=5716) images
and contains the columns:
['embedding', 'dataset', 'image_filename', 'image_path']


In [4]:
# Map every class name in the 'dataset' column to an integer code
labels = embeddings['dataset']
label_encoder = LabelEncoder().fit(labels)
y = label_encoder.transform(labels)
# Keep the integer code next to the original label for later filtering
embeddings['dataset_encoded'] = y

## 2. Model implementation:
## SVM with RBF kernel, 512 dimensions, and probability thresholds 0.4 and 0.7
### a. Data preparation
The data is no longer split into train and test; all samples are used for improved prediction of the news images

In [5]:
# Rows labelled 'Unknown' are left out of the data the model is fitted on
is_known = embeddings['dataset'].ne('Unknown')
model_data = embeddings.loc[is_known]

# Stack the per-row embedding vectors into 2-D feature matrices
X = np.vstack(model_data['embedding'].tolist())
y = model_data['dataset_encoded']

X_news = np.vstack(nos_nu_embeddings['embedding'].tolist())

print(f"Samples used to fit the model: {model_data.shape}\nNumber of news images to predict: {X_news.shape}")

Samples used to fit the model: (3598, 7)
Number of news images to predict: (11343, 512)


### b. Create pipeline

In [7]:
# Standardise the embeddings, then classify with an RBF-kernel SVM.
# probability=True is what makes predict_proba available on the fitted model.
pipe_svc = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc_rbf', SVC(kernel='rbf', probability=True, random_state=7)),
])

### c. Fit the pipeline, obtain probabilities

In [8]:
# Fit the scaler and the SVM on every sample in X / y (no train/test split in this notebook)
pipe_svc.fit(X, y)

In [9]:
# Class probabilities for every news image: one row per image, one column per fitted class
probabilities = pipe_svc.predict_proba(X_news)
# max_probability_info (from methods_classification) yields a pair per row;
# the pairs are unzipped into the predicted class and its probability
max_prob_info = list(map(max_probability_info, probabilities))
pred_class, max_prob = zip(*max_prob_info)

### d. Correct predictions 1) for the unknown class, 2) with a probability threshold

In [23]:
# Two candidate cut-offs: a prediction whose winning probability is below the
# cut-off is relabelled in the threshold step further down
threshold_1, threshold_2 = 0.4, 0.7

In [11]:
# Create DF for easy access
df_pred = pd.DataFrame({
            'X': X_news.tolist(),
            'max_prob': max_prob,
            'pred': pred_class
        })
# Correct argmax function for the class with label 17 at index 16
df_pred['pred'] = df_pred['pred'].apply(lambda x: 17 if x == 16 else x)

In [24]:
# Recalculate predictions with implemented threshold function
df_pred['pred_threshold_1'] = df_pred.apply(
    lambda row: 16 if row['max_prob'] < threshold_1 else row['pred'], axis=1
)
df_pred['pred_threshold_2'] = df_pred.apply(
    lambda row: 16 if row['max_prob'] < threshold_2 else row['pred'], axis=1
)

In [25]:
# Translate the thresholded integer codes back into class names, one label column per threshold
label_columns = {
    'pred_threshold_label_1': 'pred_threshold_1',
    'pred_threshold_label_2': 'pred_threshold_2',
}
for label_col, code_col in label_columns.items():
    df_pred[label_col] = label_encoder.inverse_transform(df_pred[code_col])

## 3. Write images to labelled folders in preparation for manual correction
### a. Prepare dataframe

In [15]:
# The id is the part of the file name before the first underscore
nos_nu_embeddings['id'] = nos_nu_embeddings['image_filename'].map(
    lambda filename: filename.partition('_')[0]
)

In [26]:
# df_pred was built positionally from X_news and therefore has a fresh 0..n-1 index.
# Assigning a Series aligns on index labels, which would silently misplace values or
# insert NaN if the frame loaded from parquet does not also carry a default index.
# Copy the raw values instead so row i of df_pred always lands on row i of the frame.
assert len(df_pred) == len(nos_nu_embeddings), "predictions and news embeddings differ in length"
nos_nu_embeddings['max_prob'] = df_pred['max_prob'].values
nos_nu_embeddings['pred_label_t1'] = df_pred['pred_threshold_label_1'].values
nos_nu_embeddings['pred_label_t2'] = df_pred['pred_threshold_label_2'].values

In [29]:
# Sanity check of the columns added so far.
# NOTE(review): the saved output lists a 'pred_label' column that none of the cells
# shown in this notebook create — likely leftover kernel state from an earlier run;
# confirm with Restart Kernel -> Run All.
nos_nu_embeddings.columns

Index(['embedding', 'dataset', 'image_filename', 'image_path', 'max_prob',
       'pred_label', 'id', 'pred_label_t1', 'pred_label_t2'],
      dtype='object')

### b. Split NOS / NU

In [32]:
# Separate the labelled news images per outlet, using the 'dataset' column
source_outlet = nos_nu_embeddings['dataset']
nos_embeddings = nos_nu_embeddings.loc[source_outlet.eq('NOS')]
nu_embeddings = nos_nu_embeddings.loc[source_outlet.eq('NU')]

#### i. NOS

In [34]:
# Folder the NOS images are read from
nos_input_folder = 'datasets/images/isolated_news_faces_nos'
# One output folder per threshold (t1, t2)
nos_output_folder_t1, nos_output_folder_t2 = (
    'datasets/images/svc_classifications/NOS_t1',
    'datasets/images/svc_classifications/NOS_t2',
)

In [18]:
# Helper from methods_classification; presumably copies each NOS image from the input
# folder into a sub-folder of the t1 output folder named after its 't1' label — confirm in the helper
write_classifications_to_folder(nos_input_folder, nos_output_folder_t1, nos_embeddings, 't1')

In [42]:
# Same helper call for the NOS images, this time with the t2 output folder and the 't2' key
# (presumably selecting the 't2' labels — confirm in methods_classification)
write_classifications_to_folder(nos_input_folder, nos_output_folder_t2, nos_embeddings, 't2')

#### ii. NU

In [36]:
# Folder the NU images are read from
nu_input_folder = 'datasets/images/isolated_news_faces_nu'
# One output folder per threshold (t1, t2)
nu_output_folder_t1, nu_output_folder_t2 = (
    'datasets/images/svc_classifications/NU_t1',
    'datasets/images/svc_classifications/NU_t2',
)

In [20]:
# Helper from methods_classification; presumably copies each NU image from the input
# folder into a sub-folder of the t1 output folder named after its 't1' label — confirm in the helper
write_classifications_to_folder(nu_input_folder, nu_output_folder_t1, nu_embeddings, 't1')

In [43]:
# Same helper call for the NU images, this time with the t2 output folder and the 't2' key
# (presumably selecting the 't2' labels — confirm in methods_classification)
write_classifications_to_folder(nu_input_folder, nu_output_folder_t2, nu_embeddings, 't2')

#### iii. Write uncorrected dataframe to file

In [38]:
# Persist the news-image frame (NOS and NU together) with the columns added above, before any manual correction.
# NOTE(review): the file name repeats 'labelled' ('labelledembeddings_labelled') — this looks
# like a typo, but other notebooks may read this exact path; confirm before renaming.
nos_nu_embeddings.to_parquet('datasets/embeddings/nos_nu_labelledembeddings_labelled_uncorrected_t1t2.parquet')