In [244]:
import keras_vggface
import tensorflow as tf
import PIL.Image as Image
import numpy as np
import mtcnn
import scipy.spatial.distance as sp
import matplotlib.pyplot as plt
from mtcnn import MTCNN
from keras_vggface import utils
from keras_vggface.vggface import VGGFace
import os
import pandas as pd
from itertools import chain
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Facial Similarity Analysis


First of all, I'm using MTCNN (Multi-task Cascaded Neural Network) in order to predict the location of the face in the image so that we would be able to crop only the face, so that we don't let the background to interfere with our similarity calculation later on. Then, we use VGGFace (with the model resnet50) in order to extract the features of the facial image in the form of an array.

In [245]:
resnet50_features = VGGFace(model='resnet50', include_top=False, input_shape=(224, 224, 3), # use vggface model that is used by resnet50
                                pooling='avg')

In [246]:
def extract_face(filename, required_size=(224, 224)):
    pixels = plt.imread(filename)
    detector = MTCNN()
    results = detector.detect_faces(pixels) # detect face inside a box
    x1, y1, width, height = results[0]['box'] 
    x2, y2 = x1 + width, y1 + height
    face = pixels[y1:y2, x1:x2]
    image = Image.fromarray(face)
    image = image.resize(required_size) # resize to 224,224 because vgg model requires it to be that way 
    face_array = np.asarray(image)
#     plt.imshow(face_array)
#     plt.show() # to show the cropped image
    pixels = face_array.astype('float32')
    pixels = tf.expand_dims(pixels, axis=0)
    samples = utils.preprocess_input(pixels, version=2) 
    features = resnet50_features.predict(samples)
    return features

We will first use the images of male actors before proceeding with female actresses.

In [247]:
path = r"/Users/valencialie/Desktop/CZ1016_DS2/kdrama/facial similarity/Images/Male"
# change the working directory to the path where the images are located
# please change accordingly
os.chdir(path)

In [248]:
actors = []

In [249]:
# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.jpeg'):
          # adds only the image files ending with jpeg to the list
            actors.append(file.name)

In [250]:
len(actors)

576

Next, we will insert all the names of the actors as well as the extracted features into a dictionary. 

In [251]:
data = {}

for actor in actors:
    try:
        feat = extract_face(actor)
        data[actor] = feat
    except:
        data[actor] = "Blank"

In [252]:
len(data)

576

Before moving forward, we ensure that all the features of the actors are extracted well (meaning there isn't going to be a "Blank" inside the dictionary value for every key inside the dictionary).

In [253]:
keys = list(data.keys())

In [254]:
values = list(data.values())

In [255]:
d = { label: value for label, value in zip(keys, values) }

In [256]:
df = pd.DataFrame(d.items(), columns=['Actor', 'Array'])

In [257]:
df[df.Array == "Blank"]

  result = libops.scalar_compare(x.ravel(), y, op)


Unnamed: 0,Actor,Array


Next, we will calculate the cosine similarity between all the extracted features. We will take out the top 10 most similar looking actor (as well as their corresponding extracted features) to every actors in this dataframe by using 2 for loops. 

In [258]:
dist_list = []
dist_list2 = []
index = []
zipped_lists = []
sorted_zipped_lists = []
top10 = []
overall = []

In [259]:
for j in range(0,576):
    for i in range(0,576):
        if i == j:
            continue
        else:
            dist = sp.cosine(values[i], values[j])
            dist_list.append(dist)
            dist_list2.append(dist)
            index.append(i)
            dist_list.sort()
            top10 = dist_list[:10]
            zipped_lists = zip(dist_list2, index)
            sorted_zipped_lists = sorted(zipped_lists)
            sorted_list1 = [element for _, element in sorted_zipped_lists]
            top10.extend(sorted_list1[:10])
    overall.append(top10)
    dist_list = []
    dist_list2 = []
    index = []
    zipped_lists = []
    sorted_list1 =[]
    top10 = []

In [260]:
df1 = pd.DataFrame(overall,columns=['Least_Array_Diff', '2ndLeast_Array_Diff','3rdLeast_Array_Diff', '4thLeast_Array_Diff', '5thLeast_Array_Diff', '6thLeast_Array_Diff', '7thLeast_Array_Diff','8thLeast_Array_Diff', '9thLeast_Array_Diff', '10thLeast_Array_Diff','1stMatch', '2ndMatch', '3rdMatch', '4thMatch', '5thMatch', '6thMatch', '7thMatch', '8thMatch', '9thMatch', '10thMatch'])

In [261]:
df2 = pd.concat([df, df1], axis = 1)

In [262]:
df2["Actor"] = df2["Actor"].str.replace("_", "")

In [263]:
df2["Actor"] = df2["Actor"].str.replace(".jpeg", "", regex = True)

In [264]:
bestmatch = []

for i in range(0,576):
    bestmatch.append(df2.iloc[df2['1stMatch'][i], 0])
    
secondmatch = []

for i in range(0,576):
    secondmatch.append(df2.iloc[df2['2ndMatch'][i], 0])
    
thirdmatch = []

for i in range(0,576):
    thirdmatch.append(df2.iloc[df2['3rdMatch'][i], 0])
    
fourthmatch = []

for i in range(0,576):
    fourthmatch.append(df2.iloc[df2['4thMatch'][i], 0])

fifthmatch = []

for i in range(0,576):
    fifthmatch.append(df2.iloc[df2['5thMatch'][i], 0])
    
sixthmatch = []

for i in range(0,576):
    sixthmatch.append(df2.iloc[df2['6thMatch'][i], 0])
    
seventhmatch = []

for i in range(0,576):
    seventhmatch.append(df2.iloc[df2['7thMatch'][i], 0])
    
eighthmatch = []

for i in range(0,576):
    eighthmatch.append(df2.iloc[df2['8thMatch'][i], 0])
    
ninthmatch = []

for i in range(0,576):
    ninthmatch.append(df2.iloc[df2['9thMatch'][i], 0])
    
tenthmatch = []

for i in range(0,576):
    tenthmatch.append(df2.iloc[df2['10thMatch'][i], 0])

df3 = pd.DataFrame(bestmatch,columns=['Best_Match_Actor'])
df4 = pd.DataFrame(secondmatch,columns=['Second_Match_Actor'])
df5 = pd.DataFrame(thirdmatch,columns=['Third_Match_Actor'])
df6 = pd.DataFrame(fourthmatch,columns=['Fourth_Match_Actor'])
df7 = pd.DataFrame(fifthmatch,columns=['Fifth_Match_Actor'])
df8 = pd.DataFrame(sixthmatch,columns=['Sixth_Match_Actor'])
df9 = pd.DataFrame(seventhmatch,columns=['Seventh_Match_Actor'])
df10 = pd.DataFrame(eighthmatch,columns=['Eighth_Match_Actor'])
df11 = pd.DataFrame(ninthmatch,columns=['Ninth_Match_Actor'])
df12 = pd.DataFrame(tenthmatch,columns=['Tenth_Match_Actor'])

df13 = pd.concat([df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12], axis = 1)

In [265]:
df13.head()

Unnamed: 0,Actor,Array,Least_Array_Diff,2ndLeast_Array_Diff,3rdLeast_Array_Diff,4thLeast_Array_Diff,5thLeast_Array_Diff,6thLeast_Array_Diff,7thLeast_Array_Diff,8thLeast_Array_Diff,...,Best_Match_Actor,Second_Match_Actor,Third_Match_Actor,Fourth_Match_Actor,Fifth_Match_Actor,Sixth_Match_Actor,Seventh_Match_Actor,Eighth_Match_Actor,Ninth_Match_Actor,Tenth_Match_Actor
0,LeeKyuHan,"[[0.0, 0.06436636, 0.0, 0.0, 2.1942368, 0.1459...",0.296106,0.317963,0.331765,0.346518,0.346633,0.353566,0.358768,0.36126,...,YangKyungWon,YoonKyunSang,LeeDongHae,OhDaeHwan,LeeYiKyung,OhJungSe,GongYoo,ParkEunSeok,HyunBin,JaeHee
1,ImShiWan,"[[0.0, 0.0, 1.957572, 0.0, 1.343608, 2.250862,...",0.34561,0.361965,0.364178,0.371152,0.376647,0.382276,0.38349,0.385891,...,LeeTaeRi,LeeJongWon,LeeSeoWon,YoonSunWoo,JiIlJoo,SongJoongKi,JungJinWoon,ParkHaeJin,HanGiChan,KimJungHyun
2,JaeHee,"[[0.0, 1.1606871, 0.0, 0.8187124, 0.12278407, ...",0.294929,0.308964,0.31913,0.348895,0.359478,0.359915,0.361896,0.362959,...,SeoJiHoon,ParkEunSeok,YangKyungWon,JiChangWook,LeeTaeRi,KimWooBin,JooSangWook,JungGunJoo,LeeYiKyung,LeeKyuHan
3,KimSungOh,"[[0.0744203, 0.27144027, 3.1494617, 0.13220347...",0.349409,0.376483,0.39936,0.411211,0.412942,0.426894,0.435751,0.441681,...,ParkEunSeok,YangKyungWon,LeeJaeYoon,OhnJooWan,ChoiDaniel,JungGunJoo,KimJiHoon,LeeDongHae,ShinSeungHwan,WooJungKook
4,JoHeeBong,"[[2.405822, 0.25564858, 0.0, 0.62448454, 0.878...",0.361091,0.392543,0.396306,0.419863,0.427464,0.432157,0.437181,0.438818,...,LeeSungJae,ChunHoJin,KimMinSang,KimWonHae,ChunJungMyung,ParkHyukKwon,LeeSungMin,LeeByungHun,LeeDaeYeon,ParkGeonRak


Next, we will do the same exact steps to all the female actresses as well.

In [33]:
actresses = []

In [34]:
path = r"/Users/valencialie/Desktop/CZ1016_DS2/kdrama/facial similarity/Images/Female"
# change the working directory to the path where the images are located
os.chdir(path)

In [35]:
# creates a ScandirIterator aliased as files
with os.scandir(path) as files:
  # loops through each file in the directory
    for file in files:
        if file.name.endswith('.jpeg'):
          # adds only the image files ending with jpeg to the list
            actresses.append(file.name)

In [36]:
len(actresses)

474

In [37]:
dataF = {}

for actress in actresses:
    try:
        feat = extract_face(actress)
        dataF[actress] = feat
    except:
        dataF[actress] = "Blank"

In [38]:
label = list(dataF.keys())
values = list(dataF.values())

In [39]:
f = { label: value for label, value in zip(label, values) }

In [40]:
dF = pd.DataFrame(f.items(), columns=['Actor', 'Array'])

In [41]:
dF[dF.Array == "Blank"]

  result = libops.scalar_compare(x.ravel(), y, op)


Unnamed: 0,Actor,Array


In [42]:
dist_list = []
dist_list2 = []
index = []
zipped_lists = []
sorted_zipped_lists = []
top10 = []
overall = []

In [43]:
for j in range(0,474):
    for i in range(0,474):
        if i == j:
            continue
        else:
            dist = sp.cosine(values[i], values[j])
            dist_list.append(dist)
            dist_list2.append(dist)
            index.append(i)
            dist_list.sort()
            top10 = dist_list[:10]
            zipped_lists = zip(dist_list2, index)
            sorted_zipped_lists = sorted(zipped_lists)
            sorted_list1 = [element for _, element in sorted_zipped_lists]
            top10.extend(sorted_list1[:10])
    overall.append(top10)
    dist_list = []
    dist_list2 = []
    index = []
    zipped_lists = []
    sorted_list1 =[]
    top10 = []

In [44]:
dF1 = pd.DataFrame(overall,columns=['Least_Array_Diff', '2ndLeast_Array_Diff','3rdLeast_Array_Diff', '4thLeast_Array_Diff', '5thLeast_Array_Diff', '6thLeast_Array_Diff', '7thLeast_Array_Diff','8thLeast_Array_Diff', '9thLeast_Array_Diff', '10thLeast_Array_Diff','1stMatch', '2ndMatch', '3rdMatch', '4thMatch', '5thMatch', '6thMatch', '7thMatch', '8thMatch', '9thMatch', '10thMatch'])

In [45]:
dF2 = pd.concat([dF, dF1], axis = 1)

In [46]:
dF2["Actor"] = dF2["Actor"].str.replace("_", "")

In [47]:
dF2["Actor"] = dF2["Actor"].str.replace(".jpeg", "", regex = True)

In [49]:
bestmatch = []

for i in range(0,474):
    bestmatch.append(dF2.iloc[dF2['1stMatch'][i], 0])
    
secondmatch = []

for i in range(0,474):
    secondmatch.append(dF2.iloc[dF2['2ndMatch'][i], 0])
    
thirdmatch = []

for i in range(0,474):
    thirdmatch.append(dF2.iloc[dF2['3rdMatch'][i], 0])
    
fourthmatch = []

for i in range(0,474):
    fourthmatch.append(dF2.iloc[dF2['4thMatch'][i], 0])

fifthmatch = []

for i in range(0,474):
    fifthmatch.append(dF2.iloc[dF2['5thMatch'][i], 0])
    
sixthmatch = []

for i in range(0,474):
    sixthmatch.append(dF2.iloc[dF2['6thMatch'][i], 0])
    
seventhmatch = []

for i in range(0,474):
    seventhmatch.append(dF2.iloc[dF2['7thMatch'][i], 0])
    
eighthmatch = []

for i in range(0,474):
    eighthmatch.append(dF2.iloc[dF2['8thMatch'][i], 0])
    
ninthmatch = []

for i in range(0,474):
    ninthmatch.append(dF2.iloc[dF2['9thMatch'][i], 0])
    
tenthmatch = []

for i in range(0,474):
    tenthmatch.append(dF2.iloc[dF2['10thMatch'][i], 0])

dF3 = pd.DataFrame(bestmatch,columns=['Best_Match_Actor'])
dF4 = pd.DataFrame(secondmatch,columns=['Second_Match_Actor'])
dF5 = pd.DataFrame(thirdmatch,columns=['Third_Match_Actor'])
dF6 = pd.DataFrame(fourthmatch,columns=['Fourth_Match_Actor'])
dF7 = pd.DataFrame(fifthmatch,columns=['Fifth_Match_Actor'])
dF8 = pd.DataFrame(sixthmatch,columns=['Sixth_Match_Actor'])
dF9 = pd.DataFrame(seventhmatch,columns=['Seventh_Match_Actor'])
dF10 = pd.DataFrame(eighthmatch,columns=['Eighth_Match_Actor'])
dF11 = pd.DataFrame(ninthmatch,columns=['Ninth_Match_Actor'])
dF12 = pd.DataFrame(tenthmatch,columns=['Tenth_Match_Actor'])

dF13 = pd.concat([dF2, dF3, dF4, dF5, dF6, dF7, dF8, dF9, dF10, dF11, dF12], axis = 1)

In [50]:
dF13.head()

Unnamed: 0,Actor,Array,Least_Array_Diff,2ndLeast_Array_Diff,3rdLeast_Array_Diff,4thLeast_Array_Diff,5thLeast_Array_Diff,6thLeast_Array_Diff,7thLeast_Array_Diff,8thLeast_Array_Diff,...,Best_Match_Actor,Second_Match_Actor,Third_Match_Actor,Fourth_Match_Actor,Fifth_Match_Actor,Sixth_Match_Actor,Seventh_Match_Actor,Eighth_Match_Actor,Ninth_Match_Actor,Tenth_Match_Actor
0,JungYooJin,"[[0.0, 0.2543437, 0.0, 4.4913435, 3.635107, 0....",0.275907,0.301436,0.301582,0.313293,0.317468,0.321232,0.321265,0.321366,...,KwonNaRa,SongHyeKyo,YooDaIn,SonYeJin,ShinSeKyung,LeeHaNa,KimJiWon,KimYoonHye,HanBoReum,ChoYeoJung
1,SeoYiAhn,"[[1.1509761, 0.047182605, 2.0433936, 0.5478225...",0.305233,0.340385,0.346528,0.360276,0.377073,0.377606,0.380573,0.385373,...,SonHwaRyung,KimHyunJoo,LeeBoYoung,LeeHwiHyang,GoYoonJung,GilEunHye,KimTaeHee,SeoJiHye,JungRyeoWon,HanSunHwa
2,KangYeNa,"[[1.9589509, 0.0, 0.40266016, 0.42564636, 2.02...",0.371118,0.399121,0.402775,0.414845,0.419751,0.429579,0.432363,0.434435,...,GoAhRa,Uee,SaHyunJin,LeeShiYoung,ImSungEon,HanHyoJoo,OhNaRa,LeeHeeWon,JangYoungNam,SeoWoo
3,KimHyunSoo,"[[0.0, 0.14416984, 4.598352, 0.061391816, 5.13...",0.305612,0.317215,0.319267,0.340414,0.34388,0.348375,0.352641,0.355292,...,KimSoHyun,JungDaBin,RyuHwaYoung,YoonEunHye,NaEunSaem,GoAhSung,SonDamBi,HanJiEun,ParkJooMi,HanGroo
4,YooHyeRi,"[[0.47068676, 0.82373685, 1.8157511, 2.4260333...",0.401623,0.409698,0.411395,0.416729,0.426655,0.426857,0.43222,0.441863,...,ShimHyeJin,KimSunAh,YooSeoJin,KimMinSeo,ChoiSooRin,KimJeeSoo,ChaeJungAn,JinYeJu,HanBoReum,KimNamJoo


# Sanity Check

Before proceeding, we're going to test our model with 10 pictures of the same actor as well as 10 pictures of the same actress. If the cosine distance calculated between these 10 pictures are relatively low, we can assume that this model is doing well because it is able to identiy a person resembling another person well (In this case, it'll be the same person, just different photos).

In [143]:
baesuzy = []

In [144]:
baesuzy.append(extract_face("baesuzy1.jpg"))

baesuzy.append(extract_face("baesuzy2.jpg"))

baesuzy.append(extract_face("baesuzy3.jpg"))

baesuzy.append(extract_face("baesuzy4.jpg"))

baesuzy.append(extract_face("baesuzy5.jpg"))

baesuzy.append(extract_face("baesuzy6.jpg"))

baesuzy.append(extract_face("baesuzy7.jpg"))

baesuzy.append(extract_face("baesuzy8.jpg"))

baesuzy.append(extract_face("baesuzy9.jpg"))

baesuzy.append(extract_face("baesuzy10.jpg"))

In [145]:
cosinebae = []

In [147]:
for i in baesuzy:
    cosinebae.append(sp.cosine(i, i+1))
    

In [152]:
def average(lst):
    return sum(lst) / len(lst)

In [153]:
average(cosinebae)

0.04922724366188049

Because the average is very low compared to when we are comparing between 2 different people's photo (10 times smaller in terms of distance!), we can say that the model worked well. Now we will try again for another male actor's photos.

In [168]:
seoinguk = []

In [169]:
seoinguk.append(extract_face("seoinguk1.jpg"))

seoinguk.append(extract_face("seoinguk2.jpg"))

seoinguk.append(extract_face("seoinguk3.jpg"))

seoinguk.append(extract_face("seoinguk4.jpg"))

seoinguk.append(extract_face("seoinguk5.jpg"))

seoinguk.append(extract_face("seoinguk6.jpg"))

seoinguk.append(extract_face("seoinguk7.jpg"))

seoinguk.append(extract_face("seoinguk8.jpg"))

seoinguk.append(extract_face("seoinguk9.jpg"))

seoinguk.append(extract_face("seoinguk10.jpg"))


In [170]:
cosineseo = []

In [171]:
for i in seoinguk:
    cosineseo.append(sp.cosine(i, i+1))

In [172]:
average(cosineseo)

0.043684196472167966

Similar to our previous findings, the average distance between the 10 photos of the same actor is very very low, compared to when we are comparing between different people. Hence, we will move forward with this model.

# Matching 

Next, we are going to subset the most similar looking actor and actress for every actor and actress in our datasets respectively so that we can recommend a match for them accordingly using the sentiment analysis matching.

In [238]:
actor = df13.iloc[:, [0,2,22]]

In [239]:
# actor.tocsv('actor.csv')

Unnamed: 0,Actor,Least_Array_Diff,Best_Match_Actor
0,LeeKyuHan,0.296106,YangKyungWon
1,ImShiWan,0.345610,LeeTaeRi
2,JaeHee,0.294929,SeoJiHoon
3,KimSungOh,0.349409,ParkEunSeok
4,JoHeeBong,0.361091,LeeSungJae
...,...,...,...
571,DongHa,0.308361,KimHyunJoong
572,SungJiRu,0.342572,ChoiMooSung
573,KimHyungMin,0.356046,ParkHoon
574,ParkByungEun,0.346609,KimJoonHan


In [240]:
actress = dF13.iloc[:, [0,2,22]]

In [242]:
# actress.to_csv('actress.csv')

In [310]:
senti = pd.read_csv("senti.csv")  #dataset from sentiment analysis matching

We will create 2 new columns for us to insert later on.

In [311]:
senti['actor1match'] = np.nan
senti['actor2match'] = np.nan

We use for loop to loop through every actors in actor in the dataframe and looping it through every actors in the senti csv (actor 1). Once we get a match, we append it inside a list, before inserting it into the senti csv column: actor1match.

In [312]:
match1 = []

for j in range (0,102):
    for i in range (0,474):
        if dF13.Best_Match_Actor[i] == senti.actor1[j]:
            match1.append(dF13.Actor[i])
            match1.append(j)

In [313]:
len(match1)

160

In [314]:
senti["actor1match"]=senti["actor1match"].astype(str)

In [315]:
match = []

In [316]:
for i in range(0,160):
    if str(match1[i]).isdigit() == True and senti.iloc[match1[i], 6] == "nan":
        senti.at[match1[i], 'actor1match'] = match1[i-1]
    elif str(match1[i]).isdigit() == True and senti.iloc[match1[i], 6] != "nan":
        match.append(senti.iloc[match1[i], 6])
        match.append(match1[i-1])
        senti.at[match1[i], 'actor1match'] = match
        match = []

We do the same for all actor2 inside the senti csv. We use for loop to loop through every actors in actor in the dataframe and looping it through every actors in the senti csv (actor2). Once we get a match, we append it inside a list, before inserting it into the senti csv column: actor2match.

In [318]:
match2 = []

for j in range (0,102):
    for i in range (0,474):
        if dF13.Best_Match_Actor[i] == senti.actor2[j]:
            match2.append(dF13.Actor[i])
            match2.append(j)

In [319]:
len(match2)

126

In [320]:
senti["actor2match"]=senti["actor2match"].astype(str)

In [321]:
match = []

In [323]:
for i in range(0,126):
    if str(match2[i]).isdigit() == True and senti.iloc[match2[i], 7] == "nan":
        senti.at[match2[i], 'actor2match'] = match2[i-1]
    elif str(match2[i]).isdigit() == True and senti.iloc[match2[i], 7] != "nan":
        match.append(senti.iloc[match2[i], 7])
        match.append(match2[i-1])
        senti.at[match2[i], 'actor2match'] = match
        match = []

In [326]:
senti.head()

Unnamed: 0.1,Unnamed: 0,pair,actor1,actor2,lineCount,score,actor1match,actor2match
0,0,"('ChaSeungWon', 'LeeSeungGi')",ChaSeungWon,LeeSeungGi,5,0.592857,,
1,1,"('JungHaeIn', 'SonYeJin')",JungHaeIn,SonYeJin,9,0.501087,,"[[[BangMinAh, KimYooJung], BangMinAh], KimYooJ..."
2,2,"('GoSungHee', 'YoonHyunMin')",GoSungHee,YoonHyunMin,5,0.487549,SeoYeJi,
3,3,"('ChoiJinHyuk', 'SongJiHyo')",ChoiJinHyuk,SongJiHyo,5,0.478333,,"[ShinDongMi, ShinDongMi]"
4,4,"('JungEunJi', 'KimJiSoo')",JungEunJi,KimJiSoo,6,0.466667,,


Since we only do it for female actresses, now we'll do the same process for male actors.

In [327]:
match2 = []

for j in range (0,102):
    for i in range (0,576):
        if df13.Best_Match_Actor[i] == senti.actor2[j]:
            match2.append(df13.Actor[i])
            match2.append(j)

In [328]:
len(match2)

150

In [329]:
match = []

In [330]:
for i in range(0,150):
    if str(match2[i]).isdigit() == True and senti.iloc[match2[i], 7] == "nan":
        senti.at[match2[i], 'actor2match'] = match2[i-1]
    elif str(match2[i]).isdigit() == True and senti.iloc[match2[i], 7] != "nan":
        match.append(senti.iloc[match2[i], 7])
        match.append(match2[i-1])
        senti.at[match2[i], 'actor2match'] = match
        match = []

In [332]:
match1 = []

for j in range (0,102):
    for i in range (0,576):
        if df13.Best_Match_Actor[i] == senti.actor1[j]:
            match1.append(df13.Actor[i])
            match1.append(j)

In [333]:
match = []

In [334]:
len(match1)

74

In [336]:
for i in range(0,74):
    if str(match1[i]).isdigit() == True and senti.iloc[match1[i], 6] == "nan":
        senti.at[match1[i], 'actor1match'] = match1[i-1]
    elif str(match1[i]).isdigit() == True and senti.iloc[match1[i], 6] != "nan":
        match.append(senti.iloc[match1[i], 6])
        match.append(match1[i-1])
        senti.at[match1[i], 'actor1match'] = match
        match = []

Cleaning the csv because there are lots of unnecessary '[' and ']' but before that we export it into a csv and then clean it because that way, all the square brackets will just be considered string instead of list so itll be easier to clean.

In [338]:
#senti.to_csv("sentiunclean.csv")

In [339]:
sentiunclean = pd.read_csv("sentiunclean.csv")

In [340]:
actor1match = [i.replace('[','').replace(']','').replace("'",'') if type(i)==str else '' for i in sentiunclean['actor1match']]

In [341]:
actor2match = [i.replace('[','').replace(']','').replace("'",'') if type(i)==str else '' for i in sentiunclean['actor2match']]

In [342]:
sentiunclean = sentiunclean.drop(columns = ['actor1match', 'actor2match'])

In [343]:
sentiunclean['actor1match'] = actor1match
sentiunclean['actor2match'] = actor2match
sentiunclean.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,pair,actor1,actor2,lineCount,score,actor1match,actor2match
0,0,0,"('ChaSeungWon', 'LeeSeungGi')",ChaSeungWon,LeeSeungGi,5,0.592857,"SungHyuk, BaeSooBin, SungHyuk, BaeSooBin",
1,1,1,"('JungHaeIn', 'SonYeJin')",JungHaeIn,SonYeJin,9,0.501087,"ParkJiBin, ParkJiBin","BangMinAh, KimYooJung, BangMinAh, KimYooJung"
2,2,2,"('GoSungHee', 'YoonHyunMin')",GoSungHee,YoonHyunMin,5,0.487549,SeoYeJi,"JangKiYong, Chani"
3,3,3,"('ChoiJinHyuk', 'SongJiHyo')",ChoiJinHyuk,SongJiHyo,5,0.478333,"LeeHyunJin, LeeHyunJin","ShinDongMi, ShinDongMi"
4,4,4,"('JungEunJi', 'KimJiSoo')",JungEunJi,KimJiSoo,6,0.466667,,


Next, we will export it into a csv so that we can use it for our dashboard

In [347]:
#sentiunclean.to_csv("senti_clean.csv")