In [1]:
#imports
import pandas as pd
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import nltk
# Fetch the Punkt tokenizer models: nltk's word_tokenize (imported above and
# passed to CountVectorizer as the tokenizer) needs them at runtime.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead - confirm against the NLTK version installed in this environment.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Colab-only step: mount Google Drive so that the dataset stored under
# /content/drive/MyDrive can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the input data. Point DATA_PATH at groups.csv instead of divisions.csv
# to work with the other labelling of the same dataset.
DATA_PATH = '/content/drive/MyDrive/AI4gov_CPV/data_cpv/divisions.csv'
#DATA_PATH = '/content/drive/MyDrive/AI4gov_CPV/data_cpv/groups.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Identificador,Objeto del Contrato,CPV,Tipo de contrato
0,6834288,"Servicio de recepción, apertura y cierre, cont...",79000000,Servicios
1,6839960,Servicio de mantenimiento de la aplicación inf...,72000000,Servicios
2,6092023,Prestación del servicio de transporte escolar ...,60000000,Servicios
3,6839857,Servicios artísticos para la representación en...,92000000,Servicios
4,6839858,Asistencia técnica para la preparación del Pro...,79000000,Servicios


In [4]:
# Keep only the contract text and its CPV label: the identifier and the
# contract type are not needed for classification
unused_columns = ['Identificador', 'Tipo de contrato']
df = df.drop(columns=unused_columns)

In [5]:
# Let us explore the size of the data. count() reports the number of non-null
# entries per column, so differing counts would reveal missing values.
df.count()

Objeto del Contrato    118781
CPV                    118781
dtype: int64

In [6]:
# The data looks fine. Separate the text (features) from the CPV codes
# (labels); the train/test split itself happens in the classification cell.
X, Y = df['Objeto del Contrato'], df['CPV']
# This is a multi-class problem: show how many contracts each CPV has
df.groupby('CPV').count()

Unnamed: 0_level_0,Objeto del Contrato
CPV,Unnamed: 1_level_1
3000000,1618
9000000,2342
14000000,277
15000000,1202
16000000,209
18000000,1644
19000000,130
22000000,572
24000000,793
30000000,3784


In [7]:
# Classify: hold out 20% of the data for evaluation and train a
# bag-of-words + linear SVM pipeline on the remaining 80%.
# random_state pins the shuffle so that the split (and every metric computed
# from it) is identical on each run, in line with the seeded LinearSVC below.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
clf = Pipeline([
    # Lowercased word counts; tokens are produced by nltk's word_tokenize
    ('vectorizer', CountVectorizer(analyzer="word",
                                    tokenizer=word_tokenize,
                                    max_features=None,
                                    lowercase=True)),
    ('linearsvc', LinearSVC(random_state=42))
])
clf.fit(X_train, y_train)



In [8]:
# Evaluate on the held-out split: per-class precision, recall and F1
y_pred = clf.predict(X_test)
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

     3000000       0.79      0.79      0.79       303
     9000000       0.88      0.87      0.88       486
    14000000       0.71      0.44      0.54        50
    15000000       0.86      0.79      0.83       263
    16000000       0.52      0.50      0.51        44
    18000000       0.65      0.63      0.64       335
    19000000       0.36      0.24      0.29        17
    22000000       0.64      0.59      0.62       118
    24000000       0.68      0.58      0.63       168
    30000000       0.71      0.75      0.73       775
    31000000       0.43      0.38      0.40       321
    32000000       0.47      0.42      0.45       245
    33000000       0.82      0.84      0.83      1519
    34000000       0.72      0.76      0.74       845
    35000000       0.53      0.43      0.47       230
    37000000       0.54      0.62      0.58        97
    38000000       0.63      0.58      0.61       301
    39000000       0.65    

In [9]:
# Try the classifier on a few hand-written descriptions.
# Reference for some CPV codes:
#   03000000  agriculture, farming
#   55000000  hotel, restaurant trade services
#   33000000  medical equipment
#   45000000  construction work
#   30000000  office supplies, machinery, etc.
examples = [
    "Semillas para la siembra",
    "Alquiler en bar del paramo",
    "Obras de metro y remodelación",
    "Materiales de oficina",
    "Grúas para demolición",
    "Gruas para demolicion",
]
test = clf.predict(examples)
test

array([ 3000000, 55000000, 45000000, 30000000, 45000000, 33000000])

In [12]:
# Problems? The results so far look acceptable, but let's try another input:
query = ["Gazpacho, paella, croquetas"]
# query = ["Daniel es un investigador en la Universidad"]
test = clf.predict(query)
test


array([45000000])

In [13]:
# There is no "Other" class, so the classifier always returns one of the known
# CPVs, whatever the input text is. Some classes also have very few examples.
#
# Create an "Other" class (label 0) by merging every CPV that has fewer than
# MIN_SAMPLES_PER_CPV contracts. This removes the classes with little
# representation and, at the same time, gives the model an "Other" answer
# for text that does not belong to any of the remaining CPVs.
MIN_SAMPLES_PER_CPV = 700  # a threshold of 1000 was also tried

# Which CPVs appear fewer than MIN_SAMPLES_PER_CPV times?
count = df.groupby(['CPV']).count()
cpvs_to_filter = count.index[
    count['Objeto del Contrato'] < MIN_SAMPLES_PER_CPV
].tolist()

# Relabel all of them as "Other" (class 0). A single vectorized assignment
# replaces the row-by-row loop, which was slow on a frame of this size.
df.loc[df['CPV'].isin(cpvs_to_filter), 'CPV'] = 0
df.head()

# Next cells: classify again with the new labels, then test.

Unnamed: 0,Objeto del Contrato,CPV
0,"Servicio de recepción, apertura y cierre, cont...",79000000
1,Servicio de mantenimiento de la aplicación inf...,72000000
2,Prestación del servicio de transporte escolar ...,60000000
3,Servicios artísticos para la representación en...,92000000
4,Asistencia técnica para la preparación del Pro...,79000000


In [14]:
# Verify the relabelling: the "Other" class (0) should now hold a decent
# number of examples
class_sizes = df.groupby('CPV').count()
class_sizes

Unnamed: 0_level_0,Objeto del Contrato
CPV,Unnamed: 1_level_1
0,3278
3000000,1618
9000000,2342
15000000,1202
18000000,1644
24000000,793
30000000,3784
31000000,1532
32000000,1216
33000000,7511


In [15]:
# Re-train the previous classifier on the relabelled data and evaluate it.
# A fixed random_state keeps the train/test split, and therefore the report
# below, reproducible from run to run.
X_new = df['Objeto del Contrato'] # Text
Y_new = df['CPV'] # Labels
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, Y_new, test_size=0.2, random_state=42)
# fit() re-trains the existing pipeline object, replacing the earlier model
clf.fit(X_train_new, y_train_new)
y_pred_new = clf.predict(X_test_new)
print(metrics.classification_report(y_test_new, y_pred_new))



              precision    recall  f1-score   support

           0       0.46      0.39      0.43       662
     3000000       0.86      0.82      0.84       334
     9000000       0.91      0.89      0.90       481
    15000000       0.83      0.83      0.83       229
    18000000       0.70      0.67      0.68       329
    24000000       0.64      0.57      0.61       145
    30000000       0.71      0.75      0.73       793
    31000000       0.52      0.38      0.44       311
    32000000       0.51      0.51      0.51       222
    33000000       0.80      0.84      0.82      1463
    34000000       0.77      0.76      0.76       949
    35000000       0.52      0.45      0.48       212
    38000000       0.65      0.55      0.60       330
    39000000       0.60      0.59      0.60       448
    42000000       0.46      0.38      0.42       301
    44000000       0.64      0.65      0.64       587
    45000000       0.86      0.90      0.88      3881
    48000000       0.62    

In [16]:
# Finally, run hand-written examples through the re-trained classifier.
# Reference for some CPV codes:
#   03000000  agriculture, farming
#   55000000  hotel, restaurant trade services
#   33000000  medical equipment
#   45000000  construction work
#   30000000  office supplies, machinery, etc.
examples = [
    "Semillas para la siembra",
    "Alquiler en bar del paramo",
    "Obras de metro y remodelación",
    "Materiales de oficina",
    "Alquiler grúas para demolición",
    "Gruas para demolicion",
]
test = clf.predict(examples)
test

array([ 3000000, 55000000, 45000000, 30000000, 45000000, 33000000])

In [17]:
# Unfortunately, the results for "Other" are still not very good,
# which means that our "Other" category needs some work
test = clf.predict(["Dinosaurios para el espacio exterior"]) # Spanish for "Dinosaurs for outer space"
test

array([45000000])

In [None]:
# One way to improve this classifier may be to preprocess the text first: stopword removal, lemmatization and accent removal.