In [103]:
import pandas as pd
import re
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

In [104]:
# Adding the data file which has the cosinr similarituy and the Clustering by Kmeans output
df_data = pd.read_csv("final_cluster_similarity.csv")
df_data

Unnamed: 0,Date,Narration,Ref_no,Value_dt,Withdrawl,Deposit,Closing Balance,source,Transaction_type,Amount,clean_narration,ClusterName
0,01/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218249800677,01/07/22,20.0,,34304.26,UPI,Withdrawl,20.00,jar save daily,0
1,02/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218327074783,02/07/22,21.0,,34283.26,UPI,Withdrawl,21.00,jar save daily,0
2,03/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218430894428,03/07/22,20.0,,34263.26,UPI,Withdrawl,20.00,jar save daily,0
3,03/07/22,UPI-NARESH PUROHIT-NP1784534@OKHDFCBANK-HDFC00...,0000218435006946,03/07/22,20.0,,34243.26,UPI,Withdrawl,20.00,naresh purohit np,3
4,04/07/22,UPI-KRISHNA VEGETABLES A-PAYTMQR281005050101OG...,0000218563429825,04/07/22,25.0,,34218.26,UPI,Withdrawl,25.00,krishna vegetables a,10
...,...,...,...,...,...,...,...,...,...,...,...,...
491,31/12/22,UPI-HAIR GLOW-9819103125@OKBIZAXIS-UTIB0000000...,0000236595133822,31/12/22,800.0,,95992.53,UPI,Withdrawl,800.00,hair glow,1
492,31/12/22,05432150000051-007188242,0000212311762603,31/12/22,,1403.39,97395.92,Others,Deposit,1403.39,,1
493,31/12/22,UPI-MAHAVIR THE HOME STO-Q80990057@YBL-YESB0YB...,0000236507176614,31/12/22,2998.0,,94397.92,UPI,Withdrawl,2998.00,mahavir the home,1
494,31/12/22,UPI-AAYUSHI SUPER SHOPPY-Q392022159@YBL-YESB0Y...,0000236509057794,31/12/22,158.0,,94239.92,UPI,Withdrawl,158.00,aayushi super shoppy,4


In [105]:
import math

In [106]:
def clean_narration(text):
    clean_text = re.sub(r"@.*", "", text)
    clean_text = re.sub(r"[^A-Za-z\s]", " ", clean_text)
    clean_text = re.sub("UPI","", clean_text)
    clean_text = re.sub(' +', ' ', clean_text)
    clean_text = clean_text.lower()
    words = re.split('\s+',clean_text)
    first_four = " ".join(words[:4])
    return first_four

In [107]:
df_data

Unnamed: 0,Date,Narration,Ref_no,Value_dt,Withdrawl,Deposit,Closing Balance,source,Transaction_type,Amount,clean_narration,ClusterName
0,01/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218249800677,01/07/22,20.0,,34304.26,UPI,Withdrawl,20.00,jar save daily,0
1,02/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218327074783,02/07/22,21.0,,34283.26,UPI,Withdrawl,21.00,jar save daily,0
2,03/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218430894428,03/07/22,20.0,,34263.26,UPI,Withdrawl,20.00,jar save daily,0
3,03/07/22,UPI-NARESH PUROHIT-NP1784534@OKHDFCBANK-HDFC00...,0000218435006946,03/07/22,20.0,,34243.26,UPI,Withdrawl,20.00,naresh purohit np,3
4,04/07/22,UPI-KRISHNA VEGETABLES A-PAYTMQR281005050101OG...,0000218563429825,04/07/22,25.0,,34218.26,UPI,Withdrawl,25.00,krishna vegetables a,10
...,...,...,...,...,...,...,...,...,...,...,...,...
491,31/12/22,UPI-HAIR GLOW-9819103125@OKBIZAXIS-UTIB0000000...,0000236595133822,31/12/22,800.0,,95992.53,UPI,Withdrawl,800.00,hair glow,1
492,31/12/22,05432150000051-007188242,0000212311762603,31/12/22,,1403.39,97395.92,Others,Deposit,1403.39,,1
493,31/12/22,UPI-MAHAVIR THE HOME STO-Q80990057@YBL-YESB0YB...,0000236507176614,31/12/22,2998.0,,94397.92,UPI,Withdrawl,2998.00,mahavir the home,1
494,31/12/22,UPI-AAYUSHI SUPER SHOPPY-Q392022159@YBL-YESB0Y...,0000236509057794,31/12/22,158.0,,94239.92,UPI,Withdrawl,158.00,aayushi super shoppy,4


In [108]:
vectorizer = TfidfVectorizer(sublinear_tf= True, min_df=2, norm='l2', ngram_range=(1, 3))
documents = df_data["clean_narration"]
vect_text=vectorizer.fit_transform(documents)

In [109]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=15,learning_method='online',random_state=42,max_iter=10) 
lda_top=lda_model.fit_transform(vect_text)
print(lda_top.shape)

(496, 15)


In [110]:
vectorizer.get_feature_names_out()

array(['aayushi', 'aayushi super', 'aayushi super shoppy', 'accenture',
       'accenture salary', 'accenture solutions',
       'accenture solutions pvt', 'aeml', 'aeml awsp', 'airtel', 'akash',
       'akash bhatt', 'and', 'app', 'app paytm', 'aug', 'autopay',
       'autopay si', 'autopay si tad', 'awsp', 'bharatpe', 'bhatt',
       'billdeskpg', 'bofa', 'bofa mm', 'bofa mm accenture',
       'capitalised', 'ccavenues', 'ccavenues ccavenues', 'center',
       'chauhan', 'chemist', 'chemist paytmqr', 'chheda', 'chouhan',
       'chouhan chauhan', 'club', 'cms', 'com', 'country',
       'country delight', 'country delight paytm', 'cr', 'cr bofa',
       'cr bofa mm', 'cr kkbk', 'cr kkbk eclerx', 'cred', 'cred club',
       'credclub', 'credclub cred', 'credclub cred club', 'credit',
       'credit interest', 'credit interest capitalised', 'daily', 'dairy',
       'dc', 'dc autopay', 'dc autopay si', 'dc emi', 'dc emi loan',
       'dec', 'delight', 'delight paytm', 'departmental',
   

In [111]:
len(vectorizer.get_feature_names_out())

282

In [112]:
len(lda_model.components_[2])

282

In [113]:
len(lda_model.components_)

15

In [114]:
topic_df = pd.DataFrame(columns=['Topic','Topic_desc'])

In [115]:
def print_topics(model, vectorizer, top_n=5): 
    for idx, topic in enumerate(model.components_): 
        print("Topic %d:" % (idx)) 
        #topic_df["Topic"].iloc[idx]=idx
        print([(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]])
        #topic_df["Topic_desc"].iloc[idx]=[(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        topic_df.loc[idx]=[idx,[(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]]]
print_topics(lda_model, vectorizer)

Topic 0:
[('kopal chouhan chauhan', 2.3305858853071073), ('chouhan', 2.3286664002640896), ('chouhan chauhan', 2.3285237587338017), ('chauhan', 2.327674918586083), ('kopal chouhan', 2.326920463024618)]
Topic 1:
[('krishna vegetables', 2.868090070716998), ('krishna', 2.867875599541343), ('vegetables', 2.4837170548888774), ('neft', 1.218013939404935), ('cr', 1.2163746848135881)]
Topic 2:
[('gpay', 6.692672965121313), ('raj', 4.287809482243271), ('vegetables', 3.3788291843535827), ('cred', 3.2777169725565467), ('credclub', 3.2641865049925283)]
Topic 3:
[('and', 3.24575679329272), ('pos', 1.8609377666494087), ('dairy', 1.8381101332622511), ('akash bhatt', 1.6746192627724776), ('bhatt', 1.6706289674378851)]
Topic 4:
[('aayushi super shoppy', 7.068495523819026), ('aayushi', 7.06838930557633), ('aayushi super', 7.06809738797555), ('super shoppy', 7.0680503548048215), ('super', 7.0680361727358765)]
Topic 5:
[('chheda', 1.464212619426527), ('kirit', 1.4640073987714586), ('jar', 0.140604583611967

In [116]:
topic_df

Unnamed: 0,Topic,Topic_desc
0,0,"[(kopal chouhan chauhan, 2.3305858853071073), ..."
1,1,"[(krishna vegetables, 2.868090070716998), (kri..."
2,2,"[(gpay, 6.692672965121313), (raj, 4.2878094822..."
3,3,"[(and, 3.24575679329272), (pos, 1.860937766649..."
4,4,"[(aayushi super shoppy, 7.068495523819026), (a..."
5,5,"[(chheda, 1.464212619426527), (kirit, 1.464007..."
6,6,"[(purohit, 8.247507413085748), (purohit np, 8...."
7,7,"[(daily, 71.31363994705612), (jar, 70.52432117..."
8,8,"[(juice, 13.3482250618993), (gupta juice cente..."
9,9,"[(com, 2.204106089334057), (uber india, 0.8802..."


In [117]:
df_data["doc_topic"]=0
df_data["topic_desc"]=""

In [118]:
for i, doc in enumerate(vect_text):
    topic = lda_model.transform(doc)
    topic, _ = topic.argmax(), topic.max()
    df_data["doc_topic"].iloc[i]=topic
    df_data["topic_desc"].iloc[i]=topic_df['Topic_desc'].loc[topic_df['Topic'] == topic]
    print(f"Document {i} belongs to topic {topic}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data["doc_topic"].iloc[i]=topic
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data["topic_desc"].iloc[i]=topic_df['Topic_desc'].loc[topic_df['Topic'] == topic]


Document 0 belongs to topic 7
Document 1 belongs to topic 7
Document 2 belongs to topic 7
Document 3 belongs to topic 6
Document 4 belongs to topic 1
Document 5 belongs to topic 7
Document 6 belongs to topic 7
Document 7 belongs to topic 7
Document 8 belongs to topic 7
Document 9 belongs to topic 7
Document 10 belongs to topic 0
Document 11 belongs to topic 7
Document 12 belongs to topic 14
Document 13 belongs to topic 4
Document 14 belongs to topic 7
Document 15 belongs to topic 7
Document 16 belongs to topic 1
Document 17 belongs to topic 7
Document 18 belongs to topic 7
Document 19 belongs to topic 6
Document 20 belongs to topic 0
Document 21 belongs to topic 12
Document 22 belongs to topic 12
Document 23 belongs to topic 7
Document 24 belongs to topic 0
Document 25 belongs to topic 4
Document 26 belongs to topic 7
Document 27 belongs to topic 6
Document 28 belongs to topic 6
Document 29 belongs to topic 7
Document 30 belongs to topic 7
Document 31 belongs to topic 7
Document 32 bel

In [119]:
df_data

Unnamed: 0,Date,Narration,Ref_no,Value_dt,Withdrawl,Deposit,Closing Balance,source,Transaction_type,Amount,clean_narration,ClusterName,doc_topic,topic_desc
0,01/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218249800677,01/07/22,20.0,,34304.26,UPI,Withdrawl,20.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211..."
1,02/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218327074783,02/07/22,21.0,,34283.26,UPI,Withdrawl,21.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211..."
2,03/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218430894428,03/07/22,20.0,,34263.26,UPI,Withdrawl,20.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211..."
3,03/07/22,UPI-NARESH PUROHIT-NP1784534@OKHDFCBANK-HDFC00...,0000218435006946,03/07/22,20.0,,34243.26,UPI,Withdrawl,20.00,naresh purohit np,3,6,"[[(purohit, 8.247507413085748), (purohit np, 8..."
4,04/07/22,UPI-KRISHNA VEGETABLES A-PAYTMQR281005050101OG...,0000218563429825,04/07/22,25.0,,34218.26,UPI,Withdrawl,25.00,krishna vegetables a,10,1,"[[(krishna vegetables, 2.868090070716998), (kr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,31/12/22,UPI-HAIR GLOW-9819103125@OKBIZAXIS-UTIB0000000...,0000236595133822,31/12/22,800.0,,95992.53,UPI,Withdrawl,800.00,hair glow,1,14,"[[(kumar, 1.915175072264887), (money, 1.724489..."
492,31/12/22,05432150000051-007188242,0000212311762603,31/12/22,,1403.39,97395.92,Others,Deposit,1403.39,,1,0,"[[(kopal chouhan chauhan, 2.3305858853071073),..."
493,31/12/22,UPI-MAHAVIR THE HOME STO-Q80990057@YBL-YESB0YB...,0000236507176614,31/12/22,2998.0,,94397.92,UPI,Withdrawl,2998.00,mahavir the home,1,0,"[[(kopal chouhan chauhan, 2.3305858853071073),..."
494,31/12/22,UPI-AAYUSHI SUPER SHOPPY-Q392022159@YBL-YESB0Y...,0000236509057794,31/12/22,158.0,,94239.92,UPI,Withdrawl,158.00,aayushi super shoppy,4,4,"[[(aayushi super shoppy, 7.068495523819026), (..."


In [120]:
df_manual_cat = pd.read_csv("master.csv")
df_manual_cat

Unnamed: 0,Extract_text,Category
0,JAR,JAR
1,RD INSTALLMENT,RD INSTALLMENT
2,DC EMI LOAN,DC EMI LOAN
3,AAYUSHI,AAYUSHI SHOP
4,JUICE,JUICE
5,AIRTEL,AIRTEL
6,JIOFIBER,JIOFIBER
7,VEGETABLES,VEGETABLES
8,COUNTRY DELIGHT,COUNTRY DELIGHT
9,HUNGERBOX,HUNGERBOX


In [121]:
def manual_category(text):
    text = text.lower()
    for i in range(0,df_manual_cat.shape[0]):
        text1 = df_manual_cat["Extract_text"].iloc[i]
        text1=text1.lower()
        if text1 in text:
            return df_manual_cat["Category"].iloc[i]
    return ""

In [122]:
df_data["Manual_category"]=df_data["clean_narration"].apply(manual_category)

In [123]:
df_data

Unnamed: 0,Date,Narration,Ref_no,Value_dt,Withdrawl,Deposit,Closing Balance,source,Transaction_type,Amount,clean_narration,ClusterName,doc_topic,topic_desc,Manual_category
0,01/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218249800677,01/07/22,20.0,,34304.26,UPI,Withdrawl,20.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211...",JAR
1,02/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218327074783,02/07/22,21.0,,34283.26,UPI,Withdrawl,21.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211...",JAR
2,03/07/22,UPI-JAR SAVE DAILY-JARMYJARONLINE@YBL-YESB0YB...,0000218430894428,03/07/22,20.0,,34263.26,UPI,Withdrawl,20.00,jar save daily,0,7,"[[(daily, 71.31363994705612), (jar, 70.5243211...",JAR
3,03/07/22,UPI-NARESH PUROHIT-NP1784534@OKHDFCBANK-HDFC00...,0000218435006946,03/07/22,20.0,,34243.26,UPI,Withdrawl,20.00,naresh purohit np,3,6,"[[(purohit, 8.247507413085748), (purohit np, 8...",DAIRY
4,04/07/22,UPI-KRISHNA VEGETABLES A-PAYTMQR281005050101OG...,0000218563429825,04/07/22,25.0,,34218.26,UPI,Withdrawl,25.00,krishna vegetables a,10,1,"[[(krishna vegetables, 2.868090070716998), (kr...",VEGETABLES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,31/12/22,UPI-HAIR GLOW-9819103125@OKBIZAXIS-UTIB0000000...,0000236595133822,31/12/22,800.0,,95992.53,UPI,Withdrawl,800.00,hair glow,1,14,"[[(kumar, 1.915175072264887), (money, 1.724489...",HAIR
492,31/12/22,05432150000051-007188242,0000212311762603,31/12/22,,1403.39,97395.92,Others,Deposit,1403.39,,1,0,"[[(kopal chouhan chauhan, 2.3305858853071073),...",
493,31/12/22,UPI-MAHAVIR THE HOME STO-Q80990057@YBL-YESB0YB...,0000236507176614,31/12/22,2998.0,,94397.92,UPI,Withdrawl,2998.00,mahavir the home,1,0,"[[(kopal chouhan chauhan, 2.3305858853071073),...",
494,31/12/22,UPI-AAYUSHI SUPER SHOPPY-Q392022159@YBL-YESB0Y...,0000236509057794,31/12/22,158.0,,94239.92,UPI,Withdrawl,158.00,aayushi super shoppy,4,4,"[[(aayushi super shoppy, 7.068495523819026), (...",AAYUSHI SHOP


In [125]:
df_data.to_csv("vectorization_lda_output.csv")