# In [None]:
# This script uses two manually compiled lists of interviewer and interviewee tags to identify who is speaking,
# and returns only the speech of the relevant speaker (the interviewee).

import re
import os

# Speaker tags that introduce speech by someone other than the interviewee.
# Each entry is a regular-expression fragment (extract_interviewee_text joins
# the entries with "|"), which is why literal parentheses are escaped; those
# entries are raw strings so the backslashes reach the regex engine unchanged.
# Order and duplicates are kept exactly as compiled by hand.
INTERVIEWER_MARKERS = [
    "Douglas:",
    "Mr. Oxenburgh:",
    "Mr. Emery:",
    "Mr. Mike Wooler:",
    "Jackie Sam:",
    "Mr. Anthony Rendell:",
    "Anthony Rendell:",
    "Interviewer:",
    "QUESTION:",
    "QUESTION :",
    "Wright:",
    "Kenneth Randall:",
    "HENRY KAMM:",
    "MR. PERIGO:",
    "PROFESSOR MCINTYRE:",
    "MR. COCKRAM:",
    "ROBIN DAY:",
    "MOHAMMAD NAHAR:",
    "NAHAR:",
    "PROF ITALIANDA:",
    "PROF:",
    "ADRIAN PORTER:",
    "ROBERT KROON:",
    "BRUCE KOHN:",
    "GERALD STONE:",
    "RICHARDSON:",
    "MICHAEL RICHARDSON:",
    "MOTOO KAEDE:",
    "DR WINFRED SHARLAU:",
    "SHARLAU:",
    "L. KENNEDY:",
    "KENNEDY:",
    "GUENTHER SCHOLZ:",
    "SCHOLZ:",
    "PADRAIC FALLON:",
    "BUSINESS TIMES:",
    "Q:",
    "Aik Yeow:",
    "Reporter:",
    "Qn:",
    "MOS Desmond Lee:",
    "Tom Mangold :",
    "Peter Hazelhurst :",
    "WINFRED SHARLAU:",
    "CHANCELLOR SCHMIDT:",
    "Barry Jordan :",
    "CHUO KORON:",
    "ALTAF GAUHAR:",
    "Mr Derek:",
    "MR. MANFRED VON JUTERCZENKA:",
    "JUTERCZENKA:",
    "Maj-Gen Agus :",
    "Q1:",
    "Q2:",
    "Q3:",
    "Q4:",
    "Q5:",
    "Q6:",
    "Q7:",
    "Q8:",
    "Q9:",
    "Q10:",
    "Q11:",
    "Q12:",
    "Q13:",
    "Christopher Lockwood:",
    "Lockwood:",
    "Mr Kazuo Nishi:",
    "Nishi:",
    "Q :",
    "Minister Takeo Hinamura:",
    "News Release:",
    "Karyawan:",
    "ANN:",
    "Richard:",
    "Mr. Sarkar :",
    r"Mushahid Ali \(Sunday Times\):",
    r"E.M. Rashid \(AFP\):",
    r"Selvaganapathy \(Tamil Murasu\):",
    r"Menon \(A.F.P.\):",
    r"Rashid \(A.F.P.\):",
    r"Muthukrishnan \(Malayan Times\):",
    "Mr. Burns:",
    "Mr. Bloodworth:",
    "Mr. Singh:",
    "Mr. Bernard Kalb:",
    "Peter Hazelhurst :",
    r"Q1\)",
    r"Q2\)",
    "NHK:",
    "Louis Kraar:",
    "Miss Bamber:",
    "CNA:",
    "ST:",
    "SMDN:",
    "TODAY:",
    "BT:",
    "WB:",
    r"Lu Caixia \(Lianhe Zaobao\):",
    r"Adrian Tay \(AsiaOne\):",
    r"Terence Lim \(PIONEER\):",
    r"S Ramesh \(CNA\):",
    r"Chong Chee Kin \(ST\):",
    r"Geraldine Soh \(938 Live\):",
    r"Ng Chun Kiat \(Asahi Shimbun\):",
    r"Shefali \(ST\):",
    r"Teh Joo Lin \(ST\):",
    "Mr Stephen Smith:",
    r"S. Ramesh \(Channel News Asia\):",
    r"Gillian Wong \(Associated Press\):",
    r"Shefali Rekhi \(Straits Times\):",
    "Dr Purnomo Yusgiantoro:",
    r"Question\(to Dr Amy Khor\):",
    r"Question\(to ESM\):",
    r"Question\(to ESM\):",
    "Dr Amy Khor:",
    "Questioner :",
    "Peter Snow:",
    "Minister Rini:",
    "Mr. Seymour:",
    "Mr. Croll:",
    "Mr. Croll :",
    "Ray Herndon:",
    "Mr. SPIVAK:",
    "MR. NEWMAN:",
    "MR. SALISBURY:",
    "MR. NOYES:",
    "EL GAMAL:",
    "Mr. Peterson:",
    "Fred Emery:",
    "Ray Herndon:",
    "Lewis Simons:",
    "Francois Nivolon :",
    "Dr Noordin Sopiee:",
    "Adrian Porter :",
    "Mackenzie :",
    "Mr. EMMON ANDREWS:",
    "ANDREWS:",
    "Tom Mangold :",
    "David:",
    "Bonavia:",
    "Karyawan:",
    "Bennetts:",
    "Belkind:",
    "Emery:",
    "Svirin:",
    "Whiting:",
    "Sherwell:",
    "Matsuda:",
    "Heine:",
    "CROCKER SNOW:",
    "SNOW:",
    "McCrone:",
    "Belkind:",
    "Mr. HERMAN:",
    "Mr. KARNOW:",
    "Mr. KALB:",
    r"Tan Yingjia \(Lianhe Zaobao\):",
    "Narrator:",
    "Ashbolt:",
    "PRESSMEN:",
    "PRESSMAN:",
    "QUESTIONER:",
    "McCrone:",
    "S. Pontappidan:",
    "Mr Davies :",
    "Dr Olaf Ihlau :",
    "Tamil Murasu:",
    r"Reynolds News \(Alex Josey\):",
    r"Press Trust of India \(Muthukrishnan\):",
    r"Reuters \(Peter Smark\):",
    "Peter Smark:",
    r"A.F.P. \(Mr. Rashid\):",
    "Reuters:",
    r"Straits Times \(Jackie Sam\):",
    "Jackie Sam:",
    r"Reuter \(David Chipp\):",
    "Bennetts:",
    "Mr. Cook:",
    "Bill Gasson:",
    "Mr. Gasson:",
    "Mr. Killen:",
    "Michael Barrett:",
    "Barrett:",
    "L. KENNEDY:",
    "KENNEDY:",
    r"Alex Kennedy \(Associated Press\):",
    r"William Choong \(Straits Times\):",
    r"Nopporn Wong-Anan \(Reuters\):",
    r"Ramesh \(CNA\):",
    r"Reiko \(JIJI Press\):",
    "Mr Thaksin:",
    "Mr Lagos:",
    "Michael Barrett:",
    "Douglas:",
    "ROBIN:",
    "S. Pontappidan:",
    "Mr Hamid Najafi Tabrizi:",
    "Mr Mohamed Moheeb Jaber:",
    "Mr Hamed Al Sewerky:",
    "Mr Hossam Abd El Naby:",
    "Mr K T Chacko:",
    "Mr Rashed Saleh Al Oraimi:",
    "Mr Touraj Shiralilou:",
    "MrTouraj Shiralilou:",
    "Mr Abbas Moussa:",
    "Mr Augustin Lee:",
    "Mr V P Hirubalan:",
    "Mr Neil Farley:",
    "Mr Ho Tong Yen:",
    # The next two entries originally lacked trailing commas, which silently
    # fused three tags into one string that matched none of them.
    "Press Trust of India:",
    "Mr. Marshall:",
    "MR. SPEAKER:",
    r"The Minister for Culture \(Mr. S Rajaratnam\):",
    "Mr. Rajah:",
    "Mr. Rajaratnam:",
    "Dr. Toh Chin Chye:",
    "An Hon Member:",
    r"Mr. A.P. Rajah \(Farrer Park\):",
    "Announcer:",
    r"Chairman \(Mr James Fu\):",
    "Dr Gwee Yee Nean:",
    "Ung Gim Sei:",
    "Prof Koh Lip Lin:",
    "Chairman:",
    "Lim Nai Tien:",
    "Dr Gwee Yee Hean:",
    "Dr Ang Koh-Ping:",
    "Dr Ker Sin Tze:",
    "Dr Ker:",
    "Mr Suen Y-Chern:",
    "Mr Sng Aik Liang:",
    "Sng:",
    "Mr Ng Kok Lip:",
    "Madam Lee Sai Yong:",
    "Mdm Lee:",
    "Miss Lim Gek Khim:",
    "Miss Lim:",
    "Madam Shi May-Chiau:",
    "Prof Teh Heng Hoon:",
    "Mr Hee Pheng Fong:",
    "Mr Loo Shaw Chang:",
    "Dr Mok Seow Koon:",
    "Tay Seow Huah:",
    "John Drysdale:",
    "Dr Ruth Wong:",
    "Dr Wong:",
    "Dr Gwee:",
    "Brother Patrick:",
    "Chairman:",
    "Miss Chan Mo-Yu:",
    "Miss Chan:",
    "Miss Liu Min Kune:",
    "Prof Teh Hoon Heng:",
    "S. Gopinathan:",
    "Mr Tan Tai Wei:",
    "Mr Ng Ser Kwei:",
    "Miss Grace Toh:",
    "Mr Saidi bin Shariff:",
    "Fong Hoe Fang:",
    "Mr Wong Lian Aik:",
    r"Mr Tay Seow Huah \(Chairman\):",
    "PRESIDENT MARCOS:",
    r"V. TANEDO \(TIMES JOURNAL\):",
    "MR GENOVEA:",
    "GENOVEA:",
    r"HANN BROWNE \(FAR EAST BROADCASTING COMPANY\):",
    "BROWNE:",
    r"RODNEY TASKER \(REVIEW, FEER\):",
    "TASKER:",
    r"MARGAWANG \(BULLETIN TODAY\):",
    r"TONY LOZANO \(CHANNEL 7, TV\):",
    "LOZANO:",
    r"KINYA SATO \(ASAHI SHIMBUN\):",
    "SATO:",
    r"MICHIHIRO SHIRAISHI \(YOMIURI SHIMBUN\):",
    "SHIRAISHI:",
    "Mr Haass:",
    "Mr Haas:",
    "Mr Abdullah:",
    "Moderator:",
    "King Abdullah:",
    "Mr Low:",
    "Mr Sitoh:",
    "Libyan Official:",
    "Question 1:",
    "Question 2:",
    "Question 3:",
    "Interpreter:",
    "Mr Li: ",
    "WB:",
    "Sir John:",
    "DEAN:",
    "Straits Times:",
    "A.F.P.:",
    "Sunday Mail:",
    "Observer:",
    "Minister for Culture:",
    "Inche Ahmad Ibrahim:",
    r"Radio Singapore \(S.C.Lim\):",
    "Dr. Lee Siew Choh:",
    "Mr. Chairman:",
    "Question from Mr. Lim Chew Hock:",
    "QUESTION BY JOSEPH LYNUS:",
    "Mr. Woodhull:",
    "Question by Mr. Manjit Singh:",
    "Question by Mr Yee Jenn Jong:",
    "Question by Er Dr Lee Bee Wah: ",
    "Question by Ms Tan Su Shan: ",
    "Question by Assoc Prof Tan Kheng Boon Eugene: ",
    "Mdm Speaker:",
    "Mr Yee Jenn Jong:",
    "Dr Lam Pin Min:",
    "Dr Purnomo Yusgiantoro:",
    r"S. Ramesh \(Channel News Asia\):",
    r"Shefali Rekhi \(Straits Times\):",
    "Baker:",
    "The Wall Street Journal:",
    "Mr Alex Yam Ziming:",
    "Answer by Minister:",
    "Er Dr Lee Bee Wah:",
    "Mr Seah Kian Peng:",
    "Mr Png Eng Huat:",
    "Mr Lim Biow Chuan:",
    "Mr Leon Perera:",
    "Mr Louis Ng Kok Kwang:",
    "Dr Chia Shi-Lu:",
    "Reply by Minister:",
    "Shaun Seow:",
    "Phua Mei Pin:",
    "Phua:",
    "Harpreet:",
    "Daniel:",
    "Lester:",
    "Yvonne Lim:",
    "Flora Tan:",
    "Renita Chua:",
    "Li Shin:",
    "Siew Pheng:",
    "Harrisfazila Zaidi:",
    "Sanjay Perera:",
    "Perera:",
    "Jonathan Wee:",
    "Madam Liu      :",
    "Mrs Huang       :",
    "Mr Chan Kai Yau :",
    "Sister Elizabeth :",
    "Mrs Anna Tham:",
    "Mr Ernest Lau :",
    "Brother Byrne  :",
    "Mr Harry Tan   :",
    "Sister Maria Ng:",
    "Mr Ang Nam Piau :",
    "ALTAF GAUHAR:",
    "MOHD NOOR:",
    "RAHMAT BUANG:",
    "JURI BIN WARI:",
    "HAJI MOHAMAD:",
    "ABU SAMAD:",
    "PONIADI BIN ROSDI:",
    "EL GAMAL:",
    "MOORE:",
    "RANDALL:",
    "MEDIANSKY:",
    "Mr. P. Govindaswamy:",
    "Mr. N. Govindasamy:",
    r"JOHN BELL \(SUNDAY TIMES, LONDON\):",
    "DR. RUDOLF HERLT: ",
    "HERLT:",
    r"OGER BOURDEAUD’ HUY   \(FINANCIAL & ECONOMIC DAILY BELGIUM\):",
    r"MARJORIE DEANE   \(ECONOMIST, LONDON\):",
    "MARJORIE DEANE:",
    r"HANS POT  \(FEM, HOLLAND\):",
    "BRIAN BELL:",
    r"ALAIN VERNHOLES  \(LE MON DE, FRANCE\):",
    "J. BLANDEN:",
    r"DR. HANS KOEPPL \(AUSTRIA\):",
    "DR. KOEPPL:",
    r"CLAUS DERTINGER   \(THE WORLD, GERMANY\):",
    r"H. DUFFY  \(GUARDIAN\):",
    "DUFFY:",
    r"KENNETH FLEET \(DAILY TELEGRAPH, LONDON\):",
    "FLEET:",
    "JAPANESE OFFICIAL :",
    "NHK:",
    "ANNOUNCER:",
    r"JOHN TULSA \(CHAIRMAN\):",
    "TULSA:",
    "WHITLAM:",
    "MORGAN-GILES:",
    "CHALFONT:",
    r"HAROLD EVANS \(Editor of the SUNDAY TIMES,London\):",
    "HAROLD EVANS:",
    r"IMMANUAEL BIRNBAUM \(Sueddeutsche Zeltung, Munich\):",
    r"TERKEL TERKELSEN \(Berlingske Tidende, Copenhagen\):",
    r"E.J.B. ROSE \(Westminister Press, London\):",
    "E.J.B. ROSE:",
    r"MICHEL CORDEY \(France-Soir, Paris\):",
    "MICHEL GORDEY:",
    r"L.K.JAKANDE \(Nigerian Tribune, Lagos\):",
    "Jimmy Hahn:",
    "Kwant:",
    "Voice:",
    "Dr. Kaunda:",
    "Mr. Trudeau:",
    "Professor Eto:",
    "Mr. Thomson:",
    "Challis:",
    "David Excel:",
    "Excel:",
    "Ward:",
    "Selvaganapathy:",
    "Menon:",
    "Ong Beng Chuan:",
    "Suleiman Jeem:",
    "Bakar:",
    "Almenoar:",
    "Chia Poteik:",
    "MISS FREDERICK:",
    "MR. EVANS:",
    "MR. TOPPING:",
    "MR. NOYAS:",
    "MR. TOPPLING:",
    "ANN:",
    "Wu Shih:",
    "Reporters:",
    "Prime Ministers:",
    "Devan Nair:",
    "Lim Chong Eu:",
    "Ong Kee Hui:",
    "Barker:",
    "Gasson:",
    "Petersen:",
]
# Speaker tags that introduce the interviewee's own speech.  The entries are
# joined with "|" into a regular expression by extract_interviewee_text, and
# the text following a matched tag is what that function keeps.  Order and
# duplicates are preserved exactly as compiled by hand.
INTERVIEWEE_MARKERS = [
    "Prime Minister:",
    "Prime Minister :",
    "Mr. Lee:",
    "ANSWER:",
    "PRIME MINISTER:",
    "Minister:",
    "ESM:",
    "Prime Minister :",
    "Prime Minister :",
    "Mr Goh:",
    "A:",
    "PM:",
    "Prime :",
    "A :",
    "Minister Yeo:",
    "Minister George Yeo:",
    "Defence Minister:",
    "Minister for Defence Dr Ng Eng Hen:",
    "Dr Ng:",
    "BG Yeo:",
    "MINISTER :",
    "BG Lee:",
    "SM:",
    "Dr. Toh:",
    "Dr . Toh:",
    "Dr Ng Eng Hen:",
    "Min:",
    "DPM:",
    "Emeritus Senior Minister Goh:",
    "MR. LEE:",
    "Mr. LEE KUAN YEW:",
    "DPM Teo:",
    "DPM/Minister:",
    "MrGoh:",
    "LEE KUAN YEW:",
    "SM Goh:",
    "THE PRIME MINISTER:",
    "Senior Minister:",
    "The P.M.:",
    "P.M.:",
    "Mr. Lee Kuan Yew:",
    "Minister’s Reply:",
    "SMS:",
    "The Prime Minister:",
]

def extract_interviewee_text(sample_text):
    """Return the interviewee's speech from a transcript, dropping the rest.

    The transcript is split at every speaker tag listed in
    ``INTERVIEWEE_MARKERS`` or ``INTERVIEWER_MARKERS``.  Text following an
    interviewee tag (up to the next tag of either kind) is kept; text
    following any other tag, and anything before the first tag, is discarded.

    Args:
        sample_text: Full transcript text.

    Returns:
        The kept passages concatenated in their original order, with leading
        and trailing whitespace stripped; ``""`` when no interviewee tag is
        found.
    """
    # The marker lists hold regex fragments, so they are joined unescaped.
    # Interviewee fragments come first because alternatives are tried left to
    # right.  Empty fragments are skipped: an empty alternative would match
    # at every position of the text.
    fragments = [m for m in (*INTERVIEWEE_MARKERS, *INTERVIEWER_MARKERS) if m]
    if not fragments:
        return ""
    pattern = "(" + "|".join(fragments) + ")"

    # A fragment such as ``Q1\)`` matches the text ``Q1)``, so the matched tag
    # never equals the fragment itself.  Classify matches against the
    # fragments with their escaping backslashes removed instead.
    interviewee_tags = {re.sub(r"\\(.)", r"\1", m) for m in INTERVIEWEE_MARKERS}

    # With a single capturing group, re.split returns
    # [preamble, tag, text, tag, text, ...].
    segments = re.split(pattern, sample_text)
    kept = [
        text
        for tag, text in zip(segments[1::2], segments[2::2])
        if tag in interviewee_tags
    ]
    return "".join(kept).strip()


def process_text_file(file_path, output_dir, output_subfolder):
    """Extract the interviewee's speech from one transcript file and save it.

    The file is read as UTF-8 and passed to ``extract_interviewee_text``.
    If any text is returned it is written, as UTF-8, to
    ``<output_dir>/<output_subfolder>/<name>-INTERVIEW.txt``; otherwise no
    file or directory is created.

    Args:
        file_path: Path of the transcript to read.
        output_dir: Root directory for the extracted files.
        output_subfolder: Sub-directory joined onto ``output_dir``; it is
            created if missing.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    interviewee_text = extract_interviewee_text(text)
    if not interviewee_text:
        return

    output_subfolder_path = os.path.join(output_dir, output_subfolder)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_subfolder_path, exist_ok=True)

    file_name = os.path.basename(file_path)
    # Rewrite only the trailing extension; str.replace would also rewrite a
    # ".txt" occurring earlier in the name.
    if file_name.endswith(".txt"):
        output_name = file_name[:-len(".txt")] + "-INTERVIEW.txt"
    else:
        output_name = file_name
    output_file_path = os.path.join(output_subfolder_path, output_name)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(interviewee_text)

def process_folder(input_folder, output_folder):
    """Run ``process_text_file`` on every ``.txt`` file under ``input_folder``.

    The directory tree is walked recursively and each file's directory,
    relative to ``input_folder``, is reproduced under ``output_folder``.

    Args:
        input_folder: Root directory containing the transcripts.
        output_folder: Root directory that receives the extracted files.
    """
    for root, _dirs, files in os.walk(input_folder):
        # Location of this directory relative to the input root ("." for the
        # root itself).  Only the relative part is handed on:
        # process_text_file joins it onto output_folder itself, so passing an
        # already-joined path would repeat a relative output_folder
        # ("out/out/...").
        subfolder = os.path.relpath(root, input_folder)
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                process_text_file(file_path, output_folder, subfolder)

# Fill in both paths before running.  They are empty strings rather than bare
# placeholders so that the file is valid Python.
input_folder = ""  # input folder
output_folder = ""  # output folder

if __name__ == "__main__":
    if not input_folder or not output_folder:
        raise SystemExit(
            "Set input_folder and output_folder at the bottom of this script before running it."
        )
    process_folder(input_folder, output_folder)

