In [1]:
import json
import networkx as nx 
from networkx.readwrite import json_graph
import pandas as pd
from IPython.display import display, Math, Latex
import re
import sys
import os
sys.path.append("D:\\Users\\figohjs\\Documents\\Network")
from Script.helper_v2 import *
from Script.generateHtml_v4 import generateHtml
from io import StringIO
from elasticsearch import Elasticsearch, helpers

### Connect to es

In [14]:
def multiSearch(fieldName, valueList):
    """Run one Elasticsearch ``match`` query per value in a single msearch call.

    Reads two module-level names that are set elsewhere in the notebook:
    ``es`` (the Elasticsearch client) and ``indexName`` (the index to query).

    Parameters
    ----------
    fieldName : str
        Document field to match against.
    valueList : iterable
        Values to look up; one query is issued per value, in order.

    Returns
    -------
    list
        One entry per value: the ``hits.hits`` list of the matching response.
        An empty ``valueList`` returns ``[]`` without contacting the cluster.
    """
    #materialise once so lists, dict views and generators behave the same
    values = list(valueList)
    if not values:
        #nothing to search - do not send an empty request body
        return []

    #record head and body msg for request (one head line + one body line per value)
    headLine = '%s \n' % json.dumps({'index': indexName})
    searchLines = []
    for value in values:
        #head
        searchLines.append(headLine)
        #body
        searchLines.append('%s \n' % json.dumps({"query": {'match': {fieldName: value}}}))

    #start multiSearch
    res = es.msearch(index = indexName, body = ''.join(searchLines))

    #extract hits value
    return [response['hits']['hits'] for response in res['responses']]

#connect es
#module-level client used by multiSearch; host, port and timeout are passed to the constructor as written
es = Elasticsearch([{'host': 'localhost', 'port':9200}], timeout=30)

In [34]:
#tell info about es cluster
#es.info()

#get all indices in es
sorted(list(es.indices.get_alias('*').keys()))

['.apm-agent-configuration',
 '.kibana_1',
 '.kibana_task_manager_1',
 '.security-7',
 'account_db',
 'assignedreadonly',
 'audit_trail',
 'contact_db',
 'detail_ctr_2013',
 'detail_ctr_2014',
 'detail_ctr_2015',
 'detail_ctr_2016',
 'detail_ctr_2017',
 'detail_ctr_2018',
 'detail_ctr_2019',
 'detail_ctr_2020',
 'detail_itis',
 'detail_str',
 'distdates',
 'distratio2020',
 'distrecords',
 'entity_db',
 'itis_db',
 'leave',
 'removedrecord',
 'report_db',
 'tax']

### read data

In [15]:
#json file - subnetwork (dict with 'nodes' and 'links' lists)
jsonFile = "D:/Users/figohjs/Documents/Network/data/interim/2021-02-19_ScamNetwork52Names.json"
with open(jsonFile, 'r') as f:
    tempGraphDict = json.loads(f.read())

#subject list; the distinct offences are used later to pick out scam cases
csvFile = "D:/Users/figohjs/Documents/Network/data/raw/subject.csv"
df = pd.read_csv(csvFile)
offenseList = df['Offence'].unique()

#additional namelist for additional scam networks - one name per line, newline removed
txtFile = "D:/Users/figohjs/Documents/Network/data/raw/addScamNameList.txt"
with open(txtFile, 'r') as myfile:
    nameList = [row.replace('\n', '') for row in myfile]

In [3]:
len(tempGraphDict['nodes']), len(tempGraphDict['links'])

(48481, 96387)

In [16]:
#from yan ling - json for other scam-related networks
yanLingJsonFolder = "D:\\Users\\figohjs\\Documents\\Network\\data\\raw\\yanLing"

#every line of every file in the folder is parsed as one JSON record
yanLingRecord = []
for yanLingFile in os.listdir(yanLingJsonFolder):
    with open(yanLingJsonFolder + '\\' + yanLingFile) as input_file:
        yanLingRecord.extend(json.loads(row) for row in input_file)

#append the extra nodes / links to the base network (tempGraphDict is extended in place)
for record in yanLingRecord:
    tempGraphDict['nodes'].extend(record['nodes'])
    tempGraphDict['links'].extend(record['links'])

In [5]:
len(tempGraphDict['nodes']), len(tempGraphDict['links'])

(687406, 1167276)

### Overview

In [6]:
#How many person in scam in repo
#keep every offence whose text contains "scam" (case-insensitive)
scamOffense = [offence for offence in offenseList if re.search("Scam", str(offence), re.I)]
isScamRow = df['Offence'].isin(scamOffense)
dfScam = df[isScamRow].copy()
numScamOffense = len(dfScam)
numMissingId = int(dfScam['ID'].isnull().sum())
numUniqueNames = len(dfScam['Name'].unique())
print("Number of scam cases: %s"%numScamOffense)
print("Missing ID: %s"%numMissingId)
print("Number of unique names: %s"%numUniqueNames)

Number of scam cases: 73
Missing ID: 73
Number of unique names: 52


In [7]:
#clean name: remove anything in parentheses and full stops, then trim whitespace
#raw strings so the regex escapes are not read as (invalid) string escapes
dfScam['CleanName'] = dfScam['Name'].replace(to_replace = [r'\(.*\)', r'\.'],
                                             value = '',  regex = True).str.strip()
scamNameList  = list(dfScam['CleanName'].unique()) + nameList

#build neoId to name dict
neoIdtoNeoNameDict = {node['NeoId']:node['pName']
                     for node in tempGraphDict['nodes'] if node['label'] in ['Entity']}
#set lookup instead of scanning the whole name list for every entity
scamNameSet = set(scamNameList)
neoIdtoNameDictTargetOnly = {i:j for i,j in neoIdtoNeoNameDict.items() if j in scamNameSet}

#one name may have more than 1 id
print("Number of unique names found in network: %s"%len(set(neoIdtoNameDictTargetOnly.values())))

Number of unique names found in network: 58


In [8]:
print("Number of unique nodes: %s"%len(tempGraphDict['nodes']))
print("Number of links: %s"%len(tempGraphDict['links']))

Number of unique nodes: 687406
Number of links: 1167276


In [9]:
#build network graph from dict of dict
networkGraph = json_graph.node_link_graph(tempGraphDict)
#find number of subgraphs - one collection of node ids per connected component
subGraphs = nx.connected_components(networkGraph)
#filter out subgraphs with only one node
subGraphsList = [component for component in subGraphs if len(component) != 1]
print("Number of subgraphs: %s" % len(subGraphsList))
# assert len(list(subGraphsList)) == 43, "number of subgraphs should be 43"

Number of subgraphs: 151


In [11]:
jsonFile = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-25_ScamNetwork48Names.json"
with open(jsonFile, 'w') as file:
    json.dump(tempGraphDict, file) 

### Multiple nodes with same name

In [37]:
[i for i in tempGraphDict['nodes'] if i['NeoId'] == 8209078]

[{'uid': 'e-25e67746-6fcd-421e-974d-e5402e733792',
  'pOccupation': ['Lawyer / Barrister / Solicitor'],
  'pName': 'MOHD FAIZAL BIN AHMAD',
  'pNameList': ['MOHD FAIZAL BIN AHMAD'],
  'pCountry': ['MALAYSIA'],
  'pIdList': ['700520016143', 'A1576955'],
  'contact_list': ['072217828', '0127777828', '073337929', '072217929'],
  'pOccupationDesc': [],
  'label': 'Entity',
  'NeoId': 8209078}]

In [38]:
[i for i in tempGraphDict['nodes'] if i['NeoId'] == 8288836]

[{'uid': 'e-b37e6a4d-8fa7-4508-b06e-b8815b838d21',
  'pOccupation': [],
  'pName': 'MOHD FAIZAL BIN AHMAD',
  'pNameList': ['MOHD FAIZAL BIN AHMAD'],
  'pCountry': ['MALAYSIA'],
  'pIdList': ['840727085669'],
  'contact_list': ['0122553104', '058072323', '0166613702', '016613702'],
  'pOccupationDesc': [],
  'label': 'Entity',
  'NeoId': 8288836}]

In [39]:
[i for i in tempGraphDict['nodes'] if i['NeoId'] == 8412685]

[{'uid': 'e-f7923333-87b3-465c-b0c1-fd1805969e1d',
  'pOccupation': ['Lawyer / Barrister / Solicitor'],
  'pName': 'MOHD FAIZAL BIN AHMAD',
  'pNameList': ['MOHD FAIZAL BIN AHMAD', 'MOHD FAIZAL'],
  'pCountry': ['MALAYSIA'],
  'pIdList': ['730803145049', 'A2445711'],
  'contact_list': ['00387360515',
   '0122902314',
   '27308541',
   '0387360515',
   '0192902314',
   'MFAIZAL@UDANET.CONM'],
  'pOccupationDesc': [],
  'label': 'Entity',
  'NeoId': 8412685}]

In [39]:
[i for i in tempGraphDict['nodes'] if i['label'] == 'STR'] 

[{'uid': 'r-37f1d0c9-a54d-49ab-8b72-60a050f17c23',
  'pOffence': 'Money Laundering',
  'pStateId': 'JOHOR',
  'pTrxAmount': '763798.81',
  'pReportDate': '20/02/2014',
  'pAttempted': 'N',
  'pRecordTypeID': 'STR financial',
  'pToTrxDate': '28/01/2014',
  'pBranchId': 'JOHOR BAHRU (CITY SQUARE)',
  'pRecordID': 'AA/021/S/2014/000199',
  'pFromTrxDate': '23/02/2012',
  'label': 'STR',
  'NeoId': 14548206},
 {'uid': 'r-f21ede65-0c1b-475a-85a8-004bdc311606',
  'pOffence': 'Fraud',
  'pStateId': 'UNKNOWN',
  'pTrxAmount': '0.0',
  'pReportDate': '23/09/2019',
  'pAttempted': 'Y',
  'pRecordTypeID': 'STR financial',
  'pBranchId': 'UNKNOWN',
  'pRecordID': 'AA/023/S/2019/000406',
  'label': 'STR',
  'NeoId': 14564357},
 {'pOffence': 'Unknown',
  'pSumInsured': '50000.0',
  'pTrxAmount': '0.0',
  'pReportDate': '30/12/2013',
  'pRecordTypeID': 'STR Insurance',
  'pPremiumAmount': '1834.93',
  'pBranchId': 'UNKNOWN',
  'uid': 'r-db6abcb5-ef0c-426e-ad66-94bfb492001e',
  'pStateId': 'UNKNOWN',

### Build dict and extract info from ES

In [17]:
#details of str node / account node - filtered once and reused below
strNodeDetails = [node for node in tempGraphDict['nodes'] if node['label'] == 'STR']
accountInfo = [node for node in tempGraphDict['nodes'] if node['label'] == 'Account']

#build neoId to es uid dict
neoIdtoUidDict = {node['NeoId']:node['uid']
                 for node in tempGraphDict['nodes'] if node['label'] in ['Entity']}

#neoId to Str Amount Dict
neoIdtoStrAmountDict = {node['NeoId']: float(node['pTrxAmount']) for node in strNodeDetails}

#neoId to Ctr Amount Dict - sum of the yearly amount fields (p2006 ... p2020) present on the account
yearCol = ['p' + str(year) for year in range(2006, 2021)]
neoIdtoCtrAmountDict = {node['NeoId']: sum(float(node[col]) for col in node.keys() if col in yearCol)
                        for node in accountInfo}
#build neoId to RI dict
neoIdtoRIDict = {node['NeoId']:node['pReportingIns'] for node in accountInfo}

#neoIds of str node
strNodeNeoIds = [node['NeoId'] for node in strNodeDetails]

#str neoId to str reportID
strNeoIdtoReportID = {node['NeoId']:node['pRecordID'] for node in strNodeDetails}

In [18]:
#es uid of every entity node in all networks
esUidList = list(neoIdtoUidDict.values())

# indexName = 'report_db'
# queryAll = {'match_all':{}}
# queryAll = {'match':{'uid':'e-7a83f99e-17ab-4f78-96f1-3375286dbfa0'}}
# res = es.search(index = indexName, size = 1, body = {'query':queryAll})

#extract info/details for each entity in entity_db
#query es - multiSearch reads the module-level indexName, so it is set right before each call
indexName = 'entity_db'
esEntityResult = multiSearch('uid', esUidList)

#uid to occupation - taken from the first hit, only when it carries an occupation field
uidToOccupationDict = {}
for hits in esEntityResult:
    if not hits:
        continue
    source = hits[0]['_source']
    if 'occupation' in source.keys():
        uidToOccupationDict[source['uid']] = ','.join(source['occupation'])

reportIDList = strNeoIdtoReportID.values()

#query es to get str info
indexName = 'detail_str'
esStrResult = multiSearch('record_id', reportIDList)

#str report ID to get desc - first hit of every non-empty result
strReportIDtoDesc = {}
for hits in esStrResult:
    if hits:
        strReportIDtoDesc[hits[0]['_source']['record_id']] = hits[0]['_source']['suspicion_desc']

In [20]:
#uid to id_list - comma-joined ids from the first hit of every non-empty entity result
uidToIDListDict = {hits[0]['_source']['uid']: ','.join(hits[0]['_source']['id_list'])
                   for hits in esEntityResult
                   if hits and 'id_list' in hits[0]['_source'].keys()}

In [15]:
#NOTE(review): these statements are identical to the STR query at the end of the
#entity/STR query cell earlier in this section; running both sends the same msearch twice
reportIDList = strNeoIdtoReportID.values()

#query es to get str info
#multiSearch reads the module-level indexName, so it must be set before the call
indexName = 'detail_str'
esStrResult = multiSearch('record_id', reportIDList)

#str report ID to get desc
#only non-empty results are used; the first hit supplies both key and value
strReportIDtoDesc = {node[0]['_source']['record_id']:node[0]['_source']['suspicion_desc'] for node in esStrResult
                    if node}

### Build table - validation purpose

In [14]:
dfNetworkScam = pd.DataFrame()

#list of neoIds for all nodes for each network
dfNetworkScam['NeoIds'] = [i for i in subGraphsList]

#get all transaction id for each network
#connected components are disjoint, so map every node to its network once and
#walk the links a single time instead of rescanning all links for every network
nodeToNetwork = {neoId: no for no, nodes in enumerate(subGraphsList) for neoId in nodes}
transIdsPerNetwork = [[] for _ in subGraphsList]
for link in tempGraphDict['links']:
    no = nodeToNetwork.get(link['source'])
    #keep the link only when both ends sit in the same (kept) network
    if no is not None and no == nodeToNetwork.get(link['target']):
        transIdsPerNetwork[no].append(link['transId'])
dfNetworkScam['TransIds'] = transIdsPerNetwork

In [15]:
#check if str connected to each network contains scam/fraud
def checkIfScam(neoIdArray, strNodes=None):
    """Flag, per network, whether any attached STR node has a scam/fraud offence.

    Parameters
    ----------
    neoIdArray : iterable of collections
        One collection of NeoIds per network.
    strNodes : list of dict, optional
        STR node dicts carrying 'NeoId' and 'pOffence'. Defaults to the
        module-level ``strNodeDetails``.

    Returns
    -------
    list of bool
        True where at least one STR offence in the network matches
        "scam" or "fraud" (case-insensitive); same order as ``neoIdArray``.
    """
    if strNodes is None:
        strNodes = strNodeDetails

    #index STR nodes by NeoId once (first occurrence wins, like the former list
    #scan) instead of rescanning every STR node for every id of every network
    strNodeByNeoId = {}
    for node in strNodes:
        strNodeByNeoId.setdefault(node['NeoId'], node)

    scamList = []
    for array in neoIdArray:
        offences = [strNodeByNeoId[neoId]['pOffence'] for neoId in set(array) if neoId in strNodeByNeoId]
        #check if scam/fraud in related str
        scamList.append(any(re.search("scam|fraud", str(offence), re.I) for offence in offences))
    return scamList

#one scam/fraud flag per network
networkNeoIds = dfNetworkScam['NeoIds']
dfNetworkScam['IsScamFraud'] = checkIfScam(networkNeoIds.values)

#target neoIds - members of the network that are known target entities
dfNetworkScam['TargetNeoIds'] = networkNeoIds.map(
    lambda x: {i for i in x if i in neoIdtoNameDictTargetOnly})

#target names
dfNetworkScam['TargetNames'] = dfNetworkScam['TargetNeoIds'].map(
    lambda x: [neoIdtoNameDictTargetOnly[i] for i in x])

In [16]:
#target names of the networks flagged as scam/fraud
#kept under its own name: `nameList` holds the extra names read from
#addScamNameList.txt and is needed again whenever scamNameList is rebuilt
scamTargetNames = dfNetworkScam[dfNetworkScam['IsScamFraud']]['TargetNames']
uniqueNameList = set([j for i in scamTargetNames for j in i])
print("Number of names found in network related to scam/fraud: %s"%len(uniqueNameList))
print("Number of network related to scam/fraud: %s"%sum(dfNetworkScam['IsScamFraud']))

Number of names found in network related to scam/fraud: 48
Number of network related to scam/fraud: 20


In [21]:
#the only name left out
set(neoIdtoNameDictTargetOnly.values()).difference(uniqueNameList)

{'NORZERIN BINTI MISKAM'}

In [68]:
tempDf = dfNetworkScam[dfNetworkScam['TargetNames'].map(lambda x: True if 'NORZERIN BINTI MISKAM' in x else False)].copy()
tempDf

Unnamed: 0,NeoIds,TransIds,IsScamFraud,TargetNeoIds,TargetNames
14,"{16950017, 11893795, 16137924, 16929763, 41719...","[11566431, 11566430, 19286470, 19286469, 14859...",False,{11109587},[NORZERIN BINTI MISKAM]


In [69]:
#check if str connected to each network contains scam/fraud
def checkIfScam(neoIdArray, strNodes=None):
    """Debug variant of checkIfScam: prints the STR offences found per network.

    Because this definition replaces the earlier one in the kernel, it also
    returns the scam/fraud flags so later calls keep getting a list of bools.

    Parameters
    ----------
    neoIdArray : iterable of collections
        One collection of NeoIds per network.
    strNodes : list of dict, optional
        STR node dicts carrying 'NeoId' and 'pOffence'. Defaults to the
        module-level ``strNodeDetails``.

    Returns
    -------
    list of bool
        True where at least one printed offence matches "scam" or "fraud"
        (case-insensitive); same order as ``neoIdArray``.
    """
    if strNodes is None:
        strNodes = strNodeDetails

    #index STR nodes by NeoId once; first occurrence wins
    strNodeByNeoId = {}
    for node in strNodes:
        strNodeByNeoId.setdefault(node['NeoId'], node)

    offenceList = [[strNodeByNeoId[neoId]['pOffence'] for neoId in set(array) if neoId in strNodeByNeoId]
                   for array in neoIdArray]
    #check if scam/fraud in related str
    print(offenceList)
    return [any(re.search("scam|fraud", str(offence), re.I) for offence in offences)
            for offences in offenceList]
checkIfScam(tempDf['NeoIds'].values)

[[]]


In [77]:
#index nodes by NeoId once (first occurrence wins) instead of rescanning the
#whole node list for every member of the network
nodeByNeoId = {}
for node in tempGraphDict['nodes']:
    nodeByNeoId.setdefault(node['NeoId'], node)

#print every node of the first network in tempDf
for nodeId in tempDf['NeoIds'].values[0]:
    print(nodeByNeoId[nodeId])
    print('\n')

{'uid': 'c-1726b7ef-e393-4b11-8874-df6848bf0da4', 'pContactInfo': '0033723333', 'label': 'Contact', 'NeoId': 16950017}


{'uid': 'e-fa38a8c4-1114-4bc6-9625-377f12064102', 'pOccupation': ['Police Officer'], 'pName': 'MONA ARI', 'pNameList': ['MONA ARI'], 'pCountry': ['MALAYSIA'], 'pIdList': ['RF130701', '770104135358', 'K0387972'], 'contact_list': ['00355145280', '0193749693', '0197539912', '0355145231', '0176236761'], 'pOccupationDesc': [], 'label': 'Entity', 'NeoId': 11893795}


{'uid': 'c-3a32a00a-9203-43ba-8ed5-87190931d320', 'pContactInfo': '0140091546770000', 'label': 'Contact', 'NeoId': 16137924}


{'uid': 'c-c3079f9e-6043-4b9b-9666-075c86fe9b40', 'pContactInfo': '0176236761', 'label': 'Contact', 'NeoId': 16929763}


{'uid': 'a-f5ae56df-cb2a-4622-a96b-d6f769ddda66', 'pTrxTypeId': 'Aggregate', 'pTotalAmount': 89937.2, 'pAccountNo': '0740020009396', 'pStateId': 'SELANGOR', 'pAccountType': 'Savings Account', 'pReportingIns': 'AmIslamic Bank Berhad.', 'p2008': 89937.2, 'pBranchId': '

In [89]:
#extract ids
targetNeoIdList = [j for i in dfNetworkScam[dfNetworkScam['IsScamFraud']]['TargetNeoIds'] for j in i]
#set lookup: the membership test runs once per node of the whole graph
targetNeoIdSet = set(targetNeoIdList)
tempIdList = [i['pIdList'] for i in tempGraphDict['nodes'] if i['NeoId'] in targetNeoIdSet]
targetIdList = [j for i in tempIdList for j in i]
#export to csv
import csv
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-03_ScamId.csv"
# with open(filename, "w") as f:
#     writer = csv.writer(f)
#     writer.writerows(targetNeoIdList)
pd.Series(targetIdList).to_csv(filename, index = False)

### General Characteristics

In [20]:
# dfNetworkScam = pd.DataFrame()

# #list of neoIds for all nodes for each network
# dfNetworkScam['NeoIds'] = [i for i in subGraphsList]

# #get all transaction id for each network
# dfNetworkScam['TransIds'] = dfNetworkScam['NeoIds'].map(lambda x:[i['transId'] for i in tempGraphDict['links'] 
#                                             if i['source'] in x and i['target'] in x])

#number of nodes for each network
dfNetworkScam['NumNodes'] = [len(i) for i in subGraphsList]

#number of connection for each network
#count the edges actually present in each component; `len(i) - 1` is only the
#edge count of a tree and makes Density equal 1/NumNodes for every network
dfNetworkScam['NumConnection'] = [networkGraph.subgraph(i).number_of_edges() for i in subGraphsList]

#calculate number of Str
dfNetworkScam['NumStr'] = dfNetworkScam['NeoIds'].map(
    lambda x: len(set(x).intersection(strNodeNeoIds)))

#get strAmount for each neoIds and sum it for each network (0 for ids without an STR amount)
dfNetworkScam['StrAmount'] = dfNetworkScam['NeoIds'].map(
    lambda x: sum(neoIdtoStrAmountDict.get(i, 0) for i in x))

#get ctrAmount for each neoIds and sum it for each network (0 for ids without a CTR amount)
dfNetworkScam['CtrAmount'] = dfNetworkScam['NeoIds'].map(
    lambda x: sum(neoIdtoCtrAmountDict.get(i, 0) for i in x))

#get RI for each network - ids without an RI, or with an empty one, are left out
dfNetworkScam['RIList'] = dfNetworkScam['NeoIds'].map(
    lambda x: [neoIdtoRIDict[i] for i in x if neoIdtoRIDict.get(i, "") != ''])

#target neoIds
dfNetworkScam['TargetNeoIds'] = dfNetworkScam['NeoIds'].map(
    lambda x: {i for i in x if i in neoIdtoNameDictTargetOnly})

#target names
dfNetworkScam['TargetNames'] = dfNetworkScam['TargetNeoIds'].map(
    lambda x: [neoIdtoNameDictTargetOnly[i] for i in x])

#target es uid
dfNetworkScam['TargetESuid'] = dfNetworkScam['TargetNeoIds'].map(
    lambda x: {neoIdtoUidDict[i] for i in x})

#number of targets for each network based on unique id (TargetNeoIds already holds sets)
dfNetworkScam['NumberTargetNeoId'] = dfNetworkScam['TargetNeoIds'].map(len)

#number of targets for each network based on unique names
dfNetworkScam['NumberTargetNames'] = dfNetworkScam['TargetNames'].map(lambda x: len(set(x)))
#check if target shares contact for each network
contactTransInfo = [i for i in tempGraphDict['links'] if i['type'] == 'HAS_CONTACT'] 
def checkTargetSharingContact(transIdArray, targetNeoIdArray, contactLinks=None):
    """Flag, per network, whether two different targets point at the same contact.

    Parameters
    ----------
    transIdArray : iterable of collections
        Transaction ids of each network.
    targetNeoIdArray : iterable of collections
        Target NeoIds of each network, aligned with ``transIdArray``.
    contactLinks : list of dict, optional
        Links carrying 'transId', 'source' and 'target'. Defaults to the
        module-level ``contactTransInfo``.

    Returns
    -------
    list of int
        1 when some link target is reached from at least two distinct target
        sources inside the network, else 0.
    """
    if contactLinks is None:
        contactLinks = contactTransInfo

    resultBool = []
    for transIds, targetNeoIds in zip(transIdArray, targetNeoIdArray):
        #set lookup: the membership test runs once per contact link
        transIdSet = set(transIds)
        #contact -> distinct targets linked to it; a link listed twice for the
        #same target must not count as sharing
        sourcesPerContact = {}
        for trans in contactLinks:
            if trans['transId'] in transIdSet and trans['source'] in targetNeoIds:
                sourcesPerContact.setdefault(trans['target'], set()).add(trans['source'])

        #1 if has shared contact else 0
        hasShared = any(len(sources) > 1 for sources in sourcesPerContact.values())
        resultBool.append(1 if hasShared else 0)
    return resultBool

dfNetworkScam['TargetSharingContact'] = checkTargetSharingContact(dfNetworkScam['TransIds'].values,
                                                                  dfNetworkScam['TargetNeoIds'].values)

In [22]:
dfNetworkScam.columns

Index(['NeoIds', 'TransIds', 'IsScamFraud', 'TargetNeoIds', 'TargetNames',
       'NumNodes', 'NumConnection', 'NumStr', 'StrAmount', 'CtrAmount',
       'TargetESuid', 'NumberTargetNeoId', 'NumberTargetNames',
       'TargetSharingContact'],
      dtype='object')

In [23]:
dfNetworkScam[dfNetworkScam['IsScamFraud']]['NumberTargetNames'].value_counts()

1     18
2      1
33     1
Name: NumberTargetNames, dtype: int64

In [44]:
# #get address from es
# esUidList = list(neoIdtoUidDict.values())

# #see if any entities (included targets) who share common address
# #query es
# indexName = 'entity_db'
# addressResult = multiSearch('uid', esUidList)

def checkTargetSharingAddress(esUidArray, addressLookup=None):
    """Find, per network, the address shared by the most target uids.

    Parameters
    ----------
    esUidArray : iterable of collections
        Target es uids of each network.
    addressLookup : dict, optional
        Maps uid -> iterable of address strings. Defaults to the module-level
        ``uidToAddressDict``.

    Returns
    -------
    list of tuple
        One ``(uidList, flag, commonAddress)`` per network. When at least two
        uids share a cleaned address: the uids holding the most widely shared
        address (ties broken alphabetically), 1, and that address. Otherwise
        ``('', 0, '')``.
    """
    from collections import Counter

    if addressLookup is None:
        addressLookup = uidToAddressDict

    #same characters as the former alternation, written as a raw character class
    specialChars = re.compile(r'[/,:()?*\-\[\].+&=%]')
    whitespace = re.compile(r'\s+')

    resultArray = []
    #for each list of target uids
    for uidArray in esUidArray:
        #clean and standardize each list of address for each uid
        addressDict = {}
        for uid in uidArray:
            if uid in addressLookup:
                addressDict[uid] = {whitespace.sub(' ', specialChars.sub('', address.lower()))
                                    for address in addressLookup[uid]}

        #get count for each unique address (each uid counts once per address)
        addressCount = Counter(address for cleaned in addressDict.values() for address in cleaned)
        sharedAddressList = [address for address, count in addressCount.items() if count > 1]

        if sharedAddressList:
            #pick deterministically instead of relying on whatever the last loop pass left behind
            commonAddress = min(sharedAddressList, key=lambda address: (-addressCount[address], address))
            uidList = [uid for uid, cleaned in addressDict.items() if commonAddress in cleaned]
            resultArray.append((uidList, 1, commonAddress))
        else:
            resultArray.append(('', 0, ''))

    return resultArray

#uid to address dict
#only 518 uid can get address - len(uidToAddressDict)
# uidToAddressDict = {i[0]['_source']['uid']: i[0]['_source']['detail']['ADDRESS'] for i in addressResult
#                    if 'ADDRESS' in i[0]['_source']['detail'].keys()}
uidToAddressDict = {}
for hits in esEntityResult:
    if not hits:
        continue
    source = hits[0]['_source']
    if 'ADDRESS' in source['detail'].keys():
        uidToAddressDict[source['uid']] = source['detail']['ADDRESS']

#one (uids, flag, address) tuple per network
targetSharingAddressTuple = checkTargetSharingAddress(dfNetworkScam['TargetESuid'].values)

#1 if targets share address, else 0
dfNetworkScam['TargetSharingAddress'] = [entry[1] for entry in targetSharingAddressTuple]

#list of target uid sharing address
dfNetworkScam['TargetUidSharingAddress'] = [entry[0] for entry in targetSharingAddressTuple]

#common address if targets share
dfNetworkScam['CommonAddress'] = [entry[2] for entry in targetSharingAddressTuple]

#targets sharing contact/address - 1 as soon as either flag is non-zero
dfNetworkScam['TargetSharingContactOrAddress'] = dfNetworkScam.apply(
    lambda x: 1 if x['TargetSharingAddress'] != 0 or x['TargetSharingContact'] != 0 else 0,
    axis = 1)



In [67]:
#get target occupation
def getTargetOccupation(esUidArray):
    """Build one comma-joined occupation string per network.

    Each uid is looked up in the module-level ``uidToOccupationDict``; uids
    without an entry and empty values are left out.

    Parameters
    ----------
    esUidArray : iterable of collections
        es uids of each network.

    Returns
    -------
    list of str
        One string per network, in input order ('' when nothing is found).
    """
    def joinOccupations(uidArray):
        #values of the uids that have an entry, empties dropped before joining
        found = [uidToOccupationDict[uid] for uid in uidArray if uid in uidToOccupationDict]
        return ','.join(occupation for occupation in found if occupation)

    return [joinOccupations(uidArray) for uidArray in esUidArray]
dfNetworkScam['TargetOccupationList'] = getTargetOccupation(dfNetworkScam['TargetESuid'].values)

In [45]:
dfNetworkScam['TargetSharingContactOrAddress'].value_counts()

0    145
1      6
Name: TargetSharingContactOrAddress, dtype: int64

In [32]:
dfNetworkScam['IsScamFraud'].value_counts()

False    131
True      20
Name: IsScamFraud, dtype: int64

In [25]:
#`addressResult` is only assigned in commented-out code; the same entity_db
#lookup (multiSearch('uid', esUidList)) is held in esEntityResult
len(esEntityResult)

105566

### Network perspective

In [13]:
Latex(r"""
\begin{equation}
Density = \frac{C}{n(n-1)}
\end{equation}
\begin{align}
\text{where } &\text{C - actual connection,}
\newline
&\text{n - number of nodes}
\end{align}
""")

<IPython.core.display.Latex object>

In [24]:
def calculateDensity(x):
    """Return NumConnection / (NumNodes * (NumNodes - 1)) for one network row.

    Parameters
    ----------
    x : mapping
        Row with 'NumConnection' and 'NumNodes'.

    Returns
    -------
    float
        The density; 0.0 when the network has fewer than two nodes, where the
        denominator would be zero.
    """
    numCon = x['NumConnection']
    numNode = x['NumNodes']
    if numNode < 2:
        return 0.0
    return numCon/(numNode*(numNode -1))

#calculate density for each network
#row-wise apply: calculateDensity reads NumConnection and NumNodes of each row
dfNetworkScam['Density'] = dfNetworkScam.apply(calculateDensity, axis = 1)

#degree centrality
#assumption: important node has many connection d(i) = number of edges/no.of nodes - 1
#NOTE(review): computed on the whole networkGraph, not per network - presumably the
#normalisation then uses the node count of the full graph; confirm this is intended
neoIdToDC = nx.degree_centrality(networkGraph)

#degree centrality for each node for each network
#ids missing from neoIdToDC get 0
dfNetworkScam['DCList'] = dfNetworkScam['NeoIds'].map(lambda x:[neoIdToDC.get(i,0) for i in x])

#get max degree centrality for each network
dfNetworkScam['MaxDC'] = dfNetworkScam['DCList'].map(max)

In [47]:
dfNetworkScam.columns

Index(['NeoIds', 'TransIds', 'IsScamFraud', 'TargetNeoIds', 'TargetNames',
       'NumNodes', 'NumConnection', 'NumStr', 'StrAmount', 'CtrAmount',
       'TargetESuid', 'NumberTargetNeoId', 'NumberTargetNames',
       'TargetSharingContact', 'Density', 'DCList', 'MaxDC',
       'TargetSharingAddress', 'TargetUidSharingAddress', 'CommonAddress',
       'TargetSharingContactOrAddress'],
      dtype='object')

In [69]:
#save df
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-15_ScamSubgraphsDF.csv"
dfNetworkScam.to_csv(filename, index = False)

### Further processing

### 2021-03-21

In [37]:
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline

In [8]:
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-15_ScamSubgraphsDF.csv"

df = pd.read_csv(filename)

#convert str to list - export issue 
def colBeList(colArray, colName):
    """Turn the string form of list/set cells (as written to CSV) back into lists.

    Parameters
    ----------
    colArray : iterable of str
        Cell texts such as "{1, 2}", "['a', 'b']", "[]" or "set()".
    colName : str
        Column name; 'NeoIds', 'TransIds' and 'TargetNeoIds' are converted
        to int, every other column keeps stripped strings.

    Returns
    -------
    list of list
        One list per cell. Empty collections ("[]", "{}", "set()") give an
        empty list rather than a placeholder entry.
    """
    intColumns = ['NeoIds', 'TransIds', 'TargetNeoIds']
    result = []
    for cell in colArray:
        #drop the set / list punctuation and the quotes left by the export
        text = cell
        for char in "{}[]'":
            text = text.replace(char, '')

        if text.strip() in ('', 'set()'):
            #empty collection -> really empty list
            result.append([])
            continue

        tokens = [token.strip() for token in text.split(',')]
        if colName in intColumns:
            #blank tokens cannot be ids; skip them instead of failing in int()
            result.append([int(token) for token in tokens if token])
        else:
            result.append(tokens)
    return result

colList = ['NeoIds', 'TransIds', 'TargetNeoIds']
for col in colList:
    if col in df.columns:
        df[col] = colBeList(df[col].values, col)

In [10]:
dfNetworkScam = df[df['IsScamFraud']].copy()
dfNetworkScam.shape

(20, 23)

In [11]:
#dfScam is the subject-CSV frame; the scam subgraph table is dfNetworkScam
dfNetworkScam.columns

Index(['NeoIds', 'TransIds', 'IsScamFraud', 'TargetNeoIds', 'TargetNames',
       'NumNodes', 'NumConnection', 'NumStr', 'StrAmount', 'CtrAmount',
       'TargetESuid', 'NumberTargetNeoId', 'NumberTargetNames',
       'TargetSharingContact', 'Density', 'DCList', 'MaxDC',
       'TargetSharingAddress', 'TargetUidSharingAddress', 'CommonAddress',
       'TargetSharingContactOrAddress', 'TargetOccupationList', 'RIList'],
      dtype='object')

In [14]:
#all node ids of the scam networks; dfScam (subject CSV) has no NeoIds column,
#the per-network id lists live in dfNetworkScam
scamNeoIdList = [j for i in dfNetworkScam['NeoIds'].values for j in i]
len(scamNeoIdList), len(set(scamNeoIdList))

(633324, 633324)

In [18]:
#get RI for each network for STR nodes
# dfNetworkScam['RIList'] = dfNetworkScam['NeoIds'].map(lambda x:[i for i in [neoIdtoRIDict.get(i, "") for i in x] 
#                                                                 if i!='' and i in strNodeNeoIds])

#uid to state dict / uid to account type dict - filled in one pass over the account nodes
uidToState = {}
uidToAccountType = {}
for account in accountInfo:
    uidToState[account['uid']] = account['pStateId']
    uidToAccountType[account['uid']] = account['pAccountType']

#account neoid to uid dict
accountNeoIdtoUidDict = {}
for node in tempGraphDict['nodes']:
    if node['label'] in ['Account']:
        accountNeoIdtoUidDict[node['NeoId']] = node['uid']

In [18]:
scamUidList = [accountNeoIdtoUidDict[i] for i in scamNeoIdList if i in accountNeoIdtoUidDict.keys()]
len(scamUidList)

455843

In [20]:
accountInfoDict = {i['uid']:i for i in accountInfo}

In [23]:
#account which are found in scam networks
accountInfo2 = [accountInfoDict[i] for i in scamUidList]

In [26]:
len(accountInfo), len(accountInfo2)

(497496, 455843)

In [52]:
#neoIdtoRIDict is from accountInfo
def countByState(neoIdArray):
    """Count, per network, how many account nodes fall under each state.

    Only ids present in the module-level ``neoIdtoRIDict`` are counted; the
    state comes from ``uidToState`` via ``accountNeoIdtoUidDict``.

    Parameters
    ----------
    neoIdArray : iterable of collections
        NeoIds of each network.

    Returns
    -------
    list of dict
        One {state: count} dict per network, in input order.
    """
    resultList = []
    for idArray in neoIdArray:
        tally = {}
        for neoId in idArray:
            if neoId not in neoIdtoRIDict:
                continue
            state = uidToState[accountNeoIdtoUidDict[neoId]]
            tally[state] = tally.get(state, 0) + 1
        resultList.append(tally)
    return resultList

#neoIdtoRIDict is from accountInfo
def countByTrxType(neoIdArray):
    """Count, per network, how many account nodes fall under each account type.

    Only ids present in the module-level ``neoIdtoRIDict`` are counted; the
    type comes from ``uidToAccountType`` via ``accountNeoIdtoUidDict``.

    Parameters
    ----------
    neoIdArray : iterable of collections
        NeoIds of each network.

    Returns
    -------
    list of dict
        One {account type: count} dict per network, in input order.
    """
    resultList = []
    for idArray in neoIdArray:
        tally = {}
        for neoId in idArray:
            if neoId not in neoIdtoRIDict:
                continue
            accountType = uidToAccountType[accountNeoIdtoUidDict[neoId]]
            tally[accountType] = tally.get(accountType, 0) + 1
        resultList.append(tally)
    return resultList

In [27]:
#(RI, year):amount
#one row per account per yearly amount field (keys containing "p" followed by four digits)
#raw string so \d is a regex escape rather than an invalid string escape
resultList = [[record['pStateId'], record['pReportingIns'], record['pAccountType'], col, record[col]]
              for record in accountInfo2
              for col in record if re.search(r"p\d{4}", col)]

In [29]:
dfAccount = pd.DataFrame(resultList, columns = ['State', 'RI', 'AccountType', 'Year', 'Amount'])
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-19_ScamAccountSummary.csv"
dfAccount.to_csv(filename, index = False)

In [53]:
dfNetworkScam['AccTotalByState'] = countByState(dfNetworkScam['NeoIds'].values)

dfNetworkScam['AccTotalByTrxType'] = countByTrxType(dfNetworkScam['NeoIds'].values)

In [60]:
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-17_ScamSubgraphsDF.csv"
dfNetworkScam.to_csv(filename, index = False)

### 2021-03-25

In [35]:
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-17_ScamSubgraphsDF.csv"

df = pd.read_csv(filename)

#convert str to list - export issue 
def colBeList(colArray, colName):
    """Turn the string form of list/set cells (as written to CSV) back into lists.

    Parameters
    ----------
    colArray : iterable of str
        Cell texts such as "{1, 2}", "['a', 'b']", "[]" or "set()".
    colName : str
        Column name; 'NeoIds', 'TransIds' and 'TargetNeoIds' are converted
        to int, every other column keeps stripped strings.

    Returns
    -------
    list of list
        One list per cell. Empty collections ("[]", "{}", "set()") give an
        empty list rather than a placeholder entry.
    """
    intColumns = ['NeoIds', 'TransIds', 'TargetNeoIds']
    result = []
    for cell in colArray:
        #drop the set / list punctuation and the quotes left by the export
        text = cell
        for char in "{}[]'":
            text = text.replace(char, '')

        if text.strip() in ('', 'set()'):
            #empty collection -> really empty list
            result.append([])
            continue

        tokens = [token.strip() for token in text.split(',')]
        if colName in intColumns:
            #blank tokens cannot be ids; skip them instead of failing in int()
            result.append([int(token) for token in tokens if token])
        else:
            result.append(tokens)
    return result

colList = ['NeoIds', 'TransIds', 'TargetNeoIds', 'TargetESuid']
for col in colList:
    if col in df.columns:
        df[col] = colBeList(df[col].values, col)

In [36]:
dfNetworkScam = df[df['IsScamFraud']].copy()
dfNetworkScam.shape

(20, 25)

In [18]:
#get all transaction id for each network
dfNetworkScam['ReportIds'] = dfNetworkScam['NeoIds'].map(lambda x:[strNeoIdtoReportID[i] for i in x if i in strNeoIdtoReportID.keys()])

dfNetworkScam['ReportDesc'] = dfNetworkScam['ReportIds'].map(lambda x:[strReportIDtoDesc[i] for i in x if i in strReportIDtoDesc])

In [19]:
#positional access: after filtering, index label 0 only exists if the first CSV row is a scam network
dfNetworkScam['ReportDesc'].iloc[0][0]

'THIS IS A REPORT OF SUSPECTED COUNTERFEIT SALES AND MERCHANT FRAUD. SEONG MUN KHIEW, 2090656347420678603 OWNS A PAYPAL ACCOUNT REGISTERED IN MALAYSIA.  SEONG?S ACCOUNT WAS CREATED ON AUGUST 1, 2015, AND HAS BEEN RECEIVING PAYMENTS FOR SELLING SUSPECTED COUNTERFEIT WATCHES ON HTTP://WWW.FASHIONACCIES.CO.IN. A REVIEW ON SEONG?S ACCOUNT FOUND THAT SEONG IS SELLING G-SHOCK WATCHES ON HTTP://WWW.BRANDEDWATCHFACTORY.COM. BOTH WEBSITES ARE LINKED VIA COMPUTER COOKIES AND PAYPAL BELIEVES HTTP://WWW.FASHIONACCIES.CO.IN IS A SHELL SITE TO COVER SEONG?S SUSPICIOUS ACTIVITY. FURTHER REVIEW FOUND THAT G-SHOCK WATCH ADVERTISED ON HTTP://WWW.BRANDEDWATCHFACTORY.COM IS ALSO FOUND ON SHELL WEBSITE UNDER: WATCHES-2013-01-17. PAYPAL BELIEVES THAT SEONG IS SELLING AND RECEIVING FUNDS FOR FAKE WATCHES SOLD ON HTTP://WWW.FASHIONACCIES.CO.IN. LATER, PAYPAL RECEIVED COMPLAINTS FROM THE BUYERS FOR ITEM NOT RECEIPT AND NOT AS DESCRIBED. DUE TO LACK OF SELLER?S RESPONSE, THE FUNDS WERE REVERSED TO COVER FOR BUY

In [46]:
def getTargetAge(esUidArray, idLookup=None, referenceYear=2020):
    """Estimate ages from the first two digits of 12-character ids.

    Parameters
    ----------
    esUidArray : iterable of collections
        es uids of each network.
    idLookup : dict, optional
        Maps uid -> comma-joined id string. Defaults to the module-level
        ``uidToIDListDict``.
    referenceYear : int, optional
        Year the age is measured against (default 2020, the value that used
        to be hard-coded).

    Returns
    -------
    list of str
        One comma-joined string per network with one entry per uid found in
        ``idLookup``: the age, or '' when the uid has no usable id.
    """
    if idLookup is None:
        idLookup = uidToIDListDict

    ageList = []
    for uidArray in esUidArray:
        tempResult = []
        for uid in uidArray:
            if uid not in idLookup:
                continue
            #usable id: 12 characters long and starting with two digits
            candidates = [i for i in idLookup[uid].split(',')
                          if len(i) == 12 and re.search(r'\d{2}', i[:2])]
            if candidates:
                first2Digits = candidates[0][:2]
                #leading 0 or 1 is read as 20xx, anything else as 19xx
                century = 2000 if first2Digits[0] in '01' else 1900
                age = referenceYear - (century + int(first2Digits))
            else:
                age = ""
            tempResult.append(str(age))
        ageList.append(','.join(tempResult))
    return ageList
dfNetworkScam['TargetAgeList'] = getTargetAge(dfNetworkScam['TargetESuid'].values)

In [44]:
dfNetworkScam['ESuid'] = dfNetworkScam['NeoIds'].map(lambda x:set([neoIdtoUidDict[i] for i in x if i in neoIdtoUidDict]))

In [47]:
dfNetworkScam['AllAgeList'] = getTargetAge(dfNetworkScam['ESuid'].values)

In [48]:
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-25_ScamSubgraphsDF.csv"
dfNetworkScam.to_csv(filename, index = False)

In [20]:
#import stopword - one word per line, newline removed
txtFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\Data\\training\\stopwords.txt"
with open(txtFile, 'r') as myfile:
    stopWordList = [row.replace('\n', '') for row in myfile]

In [109]:
filename = "D:/Users/figohjs/Documents/Network/data/interim/2021-03-23_ScamSubgraphsDF.csv"
dfNetworkScam.to_csv(filename, index = False)