In [28]:
import pandas as pd
import re

In [29]:
merchants = pd.read_csv("data/merchants.csv")
merchants.dropna(inplace=True)
merchants.info()
print("=" * 100)

receipts = pd.read_csv("data/receipts.csv")
# receipts.dropna(inplace=True)
receipts['item_name'] = receipts['item_name'].str.lower()
receipts.info()
print("=" * 100)

terminals = pd.read_csv("data/terminals.csv")
#terminals.dropna(inplace=True)
terminals['terminal_description'] = terminals['terminal_description'].astype(str).str.replace(r'[^\w\s]', '', regex=True)
terminals['terminal_description'] = terminals['terminal_description'].str.lower()
terminals.info()
print("=" * 100)

transactions = pd.read_csv("data/transactions.csv")
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   merchant_id    150 non-null    object
 1   merchant_name  150 non-null    object
 2   merchant_city  150 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4476 entries, 0 to 4475
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  4476 non-null   object 
 1   item_position   4476 non-null   int64  
 2   item_name       3710 non-null   object 
 3   item_price      4476 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 140.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298 entries, 0 to 297
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----

In [30]:
transactions.head()

Unnamed: 0,transaction_id,terminal_id,customer_id,amount,currency,transaction_time,transaction_hour,transaction_weekday,payment_method,item_count,...,device_id,batch_id,auth_code,response_code,network_type,session_id,reference_number,trace_id,pos_entry_mode,true_mcc
0,TX00001116,T000233,C000383,272.8,GBP,2024-11-16 22:00:00,22,Saturday,MOBILE,3,...,DEV-D873,B7549,771150,0,4G,274b8351,REF16528164,ed39a783-a597-4d5c-884b-8dfa9f4f2083,7,5995
1,TX00001368,T000265,C000004,196.33,GBP,2024-01-14 07:00:00,7,Sunday,MOBILE,2,...,DEV-E604,B6001,84120,0,5G,462cd323,REF51086615,208a1fbe-60ce-447a-b700-daab3da0ca42,91,5942
2,TX00000422,T000083,C000171,172.16,USD,2024-08-09 13:00:00,13,Friday,CARD,3,...,DEV-C642,B5884,363382,0,WIFI,7fd24097,REF38935409,e1fbc784-1fcc-4816-93ab-e8244540b484,7,5651
3,TX00000413,T000078,C000450,465.8,GBP,2024-08-16 23:00:00,23,Friday,CARD,4,...,DEV-C374,B9213,479281,0,4G,3f2021da,REF45112407,73532bfb-91c0-4374-bd6f-e5321a16735a,81,5651
4,TX00000451,T000086,C000486,425.81,EUR,2024-05-13 22:00:00,22,Monday,CARD,5,...,DEV-C722,B9005,376123,0,LAN,ceab216a,REF59097411,395f6a67-83dc-4675-ab2a-3d60ea698ca3,5,5651


In [31]:
exchange_rates = {
    'USD': 1.0,
    'GBP': 1.34,
    'EUR': 1.17
}

df = transactions.merge(receipts, on='transaction_id', how='left')
df = df.merge(terminals, on='terminal_id', how='left')

df['item_price'] = (df['item_price'] * df['currency'].map(exchange_rates)).round(2)
df['amount'] = (df['amount'] * df['currency'].map(exchange_rates)).round(2)

df['max_price'] = df['item_price']
df['min_price'] = df['item_price']
df['avg_price'] = df['item_price']
df['total'] = df['item_price']

df['diff'] = df['amount'] - df['total']

df = df[['transaction_id', 'currency', 'terminal_description', 'item_name',
         'item_count', 'item_price', 'max_price', 'min_price', 'avg_price', 'total', 'amount', 'diff', 'true_mcc']]
df = df.groupby('transaction_id').agg({
    'terminal_description': 'first',
    'currency': 'first',
    'item_name': lambda x: list(x),
    'amount': 'first',
    'item_count': 'first',
    'item_price': lambda x: list(x),
    'max_price': 'max',
    'min_price': 'min',
    'avg_price': lambda x: round(x.mean(), 2),
    'total': lambda x: sum(x),
    'diff': 'first',
    'true_mcc': 'first'
}).reset_index()

df.head()

Unnamed: 0,transaction_id,terminal_description,currency,item_name,amount,item_count,item_price,max_price,min_price,avg_price,total,diff,true_mcc
0,TX00000000,order basic order basic,USD,"[unit, part]",288.02,2,"[1.81, 48.01]",48.01,1.81,24.91,49.82,286.21,5411
1,TX00000001,basic basic basic basics stapleas,EUR,[thing],76.69,1,[11.55],11.55,11.55,11.55,11.55,65.14,5411
2,TX00000002,order basic order basic,EUR,"[ahsen sernade, item item]",390.7,2,"[93.61, 91.84]",93.61,91.84,92.72,185.45,297.09,5411
3,TX00000003,tcommon order piece orders common,GBP,"[nan, cacsae item elsewhee]",509.23,2,"[108.21, 49.35]",108.21,49.35,78.78,157.56,401.02,5411
4,TX00000004,general general general general,EUR,"[part arivulet, stuff driviulet, suafron piece]",346.9,3,"[32.02, 75.57, 6.97]",75.57,6.97,38.19,114.56,314.88,5411


In [32]:
import math

In [33]:
df['item_name'] = df['item_name'].apply(
    lambda x: [i for i in x if not (isinstance(i, float) and math.isnan(i))]
)
df['item_name'] = df['item_name'].apply(lambda x: " ".join(x))

df = df.drop(columns=['transaction_id', 'currency', 'item_price'])

df.head()

Unnamed: 0,terminal_description,item_name,amount,item_count,max_price,min_price,avg_price,total,diff,true_mcc
0,order basic order basic,unit part,288.02,2,48.01,1.81,24.91,49.82,286.21,5411
1,basic basic basic basics stapleas,thing,76.69,1,11.55,11.55,11.55,11.55,65.14,5411
2,order basic order basic,ahsen sernade item item,390.7,2,93.61,91.84,92.72,185.45,297.09,5411
3,tcommon order piece orders common,cacsae item elsewhee,509.23,2,108.21,49.35,78.78,157.56,401.02,5411
4,general general general general,part arivulet stuff driviulet suafron piece,346.9,3,75.57,6.97,38.19,114.56,314.88,5411


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

In [20]:
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=3,
    max_features=20000
)

X_term = tfidf.fit_transform(df['terminal_description'])
X_item = tfidf.fit_transform(df['item_name'])
X_numeric = df.iloc[:, 2:9].values

X = hstack([X_term, X_item, X_numeric])
y = df['true_mcc'].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101010, stratify=y)

AttributeError: 'TfidfVectorizer' object has no attribute 'fit_transfom'