In [4]:
import pandas as pd

data_path = '2021WORK.csv'
data = pd.read_csv(data_path)
data.head(), data.describe(include='all')


(   year                  Area  Full-time  Full-time 31 to 48 hours worked  \
 0  2021                London    3137424                          2555494   
 1  2021            Hartlepool      25787                            22207   
 2  2021         Middlesbrough      37410                            32556   
 3  2021  Redcar and Cleveland      37621                            31940   
 4  2021      Stockton-on-Tees      60758                            52004   
 
    Full-time 49 or more hours worked  
 0                             581930  
 1                               3580  
 2                               4854  
 3                               5681  
 4                               8754  ,
           year    Area     Full-time  Full-time 31 to 48 hours worked  \
 count    176.0     176  1.760000e+02                     1.760000e+02   
 unique     NaN     176           NaN                              NaN   
 top        NaN  London           NaN                              

In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Remove the 'year' column as it is the same for all entries
data.drop('year', axis=1, inplace=True)

# Encode the 'Area' column
le = LabelEncoder()
data['Area'] = le.fit_transform(data['Area'])

# Standardize the numerical data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.iloc[:, 1:])  # Apply scaler on numerical columns

processed_data = pd.DataFrame(data_scaled, columns=data.columns[1:])
processed_data['Area'] = data['Area']  # Add the encoded 'Area' column back

processed_data.head()


Unnamed: 0,Full-time,Full-time 31 to 48 hours worked,Full-time 49 or more hours worked,Area
0,12.302938,12.267948,12.399326,86
1,-0.420313,-0.429301,-0.378718,62
2,-0.372787,-0.37743,-0.35057,92
3,-0.371925,-0.380517,-0.332298,118
4,-0.277319,-0.279953,-0.264404,140


In [8]:
from sklearn.decomposition import PCA

# PCA
pca = PCA(n_components=2)
pca_results = pca.fit_transform(processed_data)

pca_df = pd.DataFrame(pca_results, columns=['PCA1', 'PCA2'])
pca_df['Area'] = data['Area']  # Add the 'Area' column for visualization purposes

pca_df.head()


Unnamed: 0,PCA1,PCA2,Area
0,-1.507055,21.3442,86
1,-25.49976,-0.717643,62
2,4.500212,-0.634063,92
3,30.500209,-0.616203,118
4,52.500155,-0.457025,140


In [9]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(pca_df.iloc[:, :2])  # Use only PCA components

tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])
tsne_df['Area'] = pca_df['Area']  # Add the 'Area' column for visualization purposes

tsne_df.head()


Unnamed: 0,TSNE1,TSNE2,Area
0,-0.446696,1.106585,86
1,-4.919,-0.191588,62
2,0.972902,-0.040856,92
3,6.066119,0.043907,118
4,10.401743,0.12887,140


In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


processed_data['Area'] = le.inverse_transform(processed_data['Area'])
pca_df['Area'] = le.inverse_transform(pca_df['Area']) 
tsne_df['Area'] = le.inverse_transform(tsne_df['Area'])  


pca_file_path = 'PCA_results_decoded.csv'
pca_df.to_csv(pca_file_path, index=False)

tsne_file_path = 'tSNE_results_decoded.csv' 
tsne_df.to_csv(tsne_file_path, index=False)
