In [2]:
!pip install lexical-diversity
!pip install tag_fixer

Collecting lexical-diversity
  Downloading https://files.pythonhosted.org/packages/fb/32/6800b1d0f65fa488a7411d77195e94ad6b7b2002040805f776c8b3fca3a2/lexical_diversity-0.1.0-py3-none-any.whl (117kB)
Installing collected packages: lexical-diversity
Successfully installed lexical-diversity-0.1.0


In [6]:
from lexical_diversity import lex_div as ld

In [7]:
# Sample passage (a paragraph about the state of Colorado) that the tokenize /
# lemmatize cell below takes as its input.
text = """The state was named for the Colorado River, which Spanish travelers named the Río Colorado for the ruddy silt the river carried from the mountains. The Territory of Colorado was organized on February 28, 1861, and on August 1, 1876, U.S. President Ulysses S. Grant signed Proclamation 230 admitting Colorado to the Union as the 38th state. Colorado is nicknamed the "Centennial State" because it became a state a century after the signing of the United States Declaration of Independence. Colorado is bordered by Wyoming to the north, Nebraska to the northeast, Kansas to the east, Oklahoma to the southeast, New Mexico to the south, Utah to the west, and touches Arizona to the southwest at the Four Corners. Colorado is noted for its vivid landscape of mountains, forests, high plains, mesas, canyons, plateaus, rivers, and desert lands. Colorado is part of the western or southwestern United States, and one of the Mountain States. Denver is the capital and most populous city of Colorado. Residents of the state are known as Coloradans, although the antiquated term "Coloradoan" is occasionally used."""

In [8]:
## tokenize: turn the raw passage into a list of tokens
tok = ld.tokenize(text)
## lemmatize: build the lemmatized token list that the diversity scores below are computed on
flt = ld.flemmatize(text)

# Display the first ten tokens as a sanity check. The output recorded under this
# cell shows exactly such a ten-token list, but the cell previously had no
# expression that would produce it on a fresh run.
tok[:10]

['the', 'state', 'was', 'named', 'for', 'the', 'colorado', 'river', 'which', 'spanish']


In [11]:
## Lexical diversity scores, each computed on the lemmatized token list `flt`
## Simple TTR (type-token ratio)
simple_TTR = ld.ttr(flt)

## Root TTR
root_TTR = ld.root_ttr(flt)

## Log TTR
log_TTR = ld.log_ttr(flt)

## Maas TTR (the variable keeps its original `mass_TTR` spelling so existing references keep working)
mass_TTR = ld.maas_ttr(flt)

## Mean segmental TTR (MSTTR); no extra arguments are passed, so the library defaults apply
ms_TTR = ld.msttr(flt)

In [28]:
## speaker content: three short sample transcripts, one string per speaker
speaker1 = """We stand in line and pay expensive prescription drugs. We have to have a healthcare guarantee. If you’re sick, you’re seen it. And in America, you never go broke because of it."""
speaker2 = """A lot of you have been talking tonight about these government healthcare plans that you proposed in one form or another. This is a show of hands question. And hold them up for a moment so people can see. Raise your hand if—if your government plan would provide coverage for undocumented immigrants"""
speaker3 = """Because our country is healthier when everybody is healthier. And remember, we’re talking about something people are getting a—given a chance to buy into. In the same way that there are undocumented immigrants in my community who pay, they pay sales taxes, they pay property taxes directly or indirectly. This is not about a handout. This is an insurance program. And we do ourselves no favor by having 11 million undocumented people in our country be unable to access healthcare."""

## List of the speakers: labels in the same order as the entries of `corpus` below
speaker_list = ['speaker_1', 'speaker_2', 'speaker_3']

## content list: the transcripts in the same order as `speaker_list`
corpus = [speaker1 , speaker2, speaker3]

In [26]:
# Pairwise similarity between the three sample speakers, labelled by `speaker_list`.
# NOTE(review): `similarity_score` is defined in a cell further down this notebook,
# so on a fresh kernel that cell has to be run before this one.
similarity = similarity_score(corpus, speaker_list)
similarity

Unnamed: 0,speaker_1,speaker_2,speaker_3
speaker_1,1.0,0.023215,0.09468
speaker_2,0.023215,1.0,0.126181
speaker_3,0.09468,0.126181,1.0


## start here for our data

In [27]:
## Define the function
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def similarity_score(corpus, speaker_list):
    """Build a labelled pairwise similarity matrix for a collection of texts.

    Each text in ``corpus`` is turned into a TF-IDF vector (English stop words
    removed) and every pair of vectors is compared via their dot product.
    With ``TfidfVectorizer``'s default row normalisation this dot product is
    the cosine similarity, which is why the diagonal of the recorded output
    is 1.0.

    Parameters
    ----------
    corpus : sequence of str
        One text per speaker.
    speaker_list : sequence
        Speaker labels, in the same order and of the same length as
        ``corpus``; used as both the row and the column labels.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose ``[i, j]`` entry is the similarity between the
        texts of speaker ``i`` and speaker ``j``.

    Raises
    ------
    ValueError
        If ``speaker_list`` does not have one label per text in ``corpus``
        (raised by pandas when the labels do not fit the matrix).
    """
    vect = TfidfVectorizer(min_df=1, stop_words="english")
    tfidf = vect.fit_transform(corpus)
    # `@` is always the matrix product. `*` only means that for the legacy
    # scipy `spmatrix` classes and is element-wise for sparse arrays.
    pairwise_similarity = tfidf @ tfidf.T
    return pd.DataFrame(
        pairwise_similarity.toarray(),
        index=speaker_list,
        columns=speaker_list,
    )

In [48]:
## read the script
import pandas as pd
# Location of the transcript workbook, kept in one place so it is easy to repoint.
# For an earlier version of Excel, you may need to use the file extension of 'xls'.
SPEECH_XLSX_PATH = 'C:/Users/monali/Desktop/Speech to text/speech.xlsx'
df = pd.read_excel(SPEECH_XLSX_PATH)

## get unique speakers
speaker_list = df['Speaker'].unique()

## create speaker paragraphs: one concatenated string per speaker,
## in the same order as `speaker_list`
speaker_master_list = []
for speaker in speaker_list:
    # Select this speaker's utterances with a single .loc call instead of
    # filtering the frame and then indexing the column row by row.
    speaker_texts = df.loc[df['Speaker'] == speaker, 'Text']
    # Empty cells are read as NaN and previously broke the string concatenation
    # with a TypeError; skip them and coerce any remaining non-string cell.
    speaker_texts = speaker_texts.dropna().astype(str)
    # Every utterance is prefixed with a single space, exactly as before, so the
    # resulting paragraph is unchanged for data that has no empty cells.
    speaker_master_list.append("".join(" " + utterance for utterance in speaker_texts))

In [50]:
## generate similarity matrix: one row and one column per speaker in `speaker_list`,
## computed from the per-speaker paragraphs in `speaker_master_list`
similarity_matrix = similarity_score(speaker_master_list, speaker_list)

In [52]:
# Write the matrix to "similarity_matrix.csv" (a relative path, so it is created in
# the kernel's current working directory), then display the matrix as the cell output.
similarity_matrix.to_csv("similarity_matrix.csv")
similarity_matrix