# Basic Phrase & Word Searches via Python

### SQL: Import data and create convert to Pandas dataframe

In [6]:
import pandas as pd  

# PY-ODBC, an open source Python package that makes accessing ODBC databases simple
import pyodbc

# SQL-Alchemy; a SQL toolkit that gives application developers the full power and flexibility of SQL
import sqlalchemy

# for Regular Expression (RegEx)
import re

# to tokenize text into words
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tshob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Sets up an engine for an SQLAlchemy application, defining how to connect with a database
# Takes the general form:  dialect+driver://username:password@host:port/database
# DESKTOP-UBSKTQ6\tshob

from sqlalchemy import create_engine
engine = create_engine('mssql+pyodbc://tshob:dbmaster@MSFT SQL Server')

In [8]:
# Format: display full column width
pd.set_option('display.max_colwidth', -1)

In [9]:
# Query from transcript database table

query = """
SELECT *
FROM master.dbo.transcript_row
ORDER BY id ASC
"""

Data = pd.read_sql(query, engine)
Data.head()

Unnamed: 0,id,text
0,ABC123,How much money do I have in my money account?\r
1,ABC456,Where is my money?\r
2,ABC789,"Hello, how are you today? What is your name?\r"
3,DEF123,My money market fund is my only money of my account
4,DEF456,Is my money now in my account? I need to know if my account has any of my money


In [10]:
# Convert to Pandas Dataframe

pandasDF = pd.DataFrame(Data)

# When using Spark session
# pandasDF = data.toPandas()
# print(pandasDF)

# Clean data
# Remove unwanted characters and convert to all lowercase

pandasDF['text'] = pandasDF['text'].str.replace(',', ' ')
pandasDF['text'] = pandasDF['text'].str.replace('.', '')
pandasDF['text'] = pandasDF['text'].str.replace('?', '')
pandasDF['text'] = pandasDF['text'].str.replace('\r', '')
pandasDF['text'] = pandasDF['text'].str.lower()

print(pandasDF)

       id  \
0  ABC123   
1  ABC456   
2  ABC789   
3  DEF123   
4  DEF456   
5  DEF789   
6  GHI123   
7  GHI456   
8  GHI789   

                                                                                                                                                                                                                                               text  
0  how much money do i have in my money account                                                                                                                                                                                                      
1  where is my money                                                                                                                                                                                                                                 
2  hello  how are you today what is your name                                                                                      

In [20]:
print(pandasDF.dtypes)
type(pandasDF['text'])

id      object
text    object
dtype: object


pandas.core.series.Series

#### Confirm data types of Dataframe fields

### Count number of words between Phrase 1 and Phrase 2

In [7]:
def phrase_finder(text: str, str1: str, str2: str):
    results = []
    for match1 in re.finditer(str1, text):
        for match2 in re.finditer(str2, text):
            if match1.end() < match2.start():
                results.append(text[match1.end():match2.start()])
                break
    return results

pandasDF['between_words'] = pandasDF['text'].apply(phrase_finder, args=('test word', 'practice sentence'))
# pandasDF.drop(columns = ['text'], axis = 1, inplace = True)

print(pandasDF)

       id  \
0  ABC123   
1  ABC456   
2  ABC789   
3  DEF123   
4  DEF456   
5  DEF789   
6  GHI123   
7  GHI456   

                                                                                                                                                                                                                                             text  \
0  how much money do i have in my money account                                                                                                                                                                                                     
1  where is my money                                                                                                                                                                                                                                
2  hello  how are you today what is your name                                                                                                      

### Identify all positions of Keyword - (words only, not phrases - does not recognize spaces)

#### Show all word-positions of the word "money"

In [32]:
def indices(string, search_word):   
    string = string.split()
    return [i for i, x in enumerate(string) if x == search_word] 

# Apply search function and insert search word
pandasDF['money_position'] = pandasDF['text'].apply(indices, search_word = 'money')
 
# Add fixed Category name column
pandasDF.insert(2, 'category', 'money talk') 

print(pandasDF) 

# Check if dataframe column List contains at least one element
# pandasDF.insert(3, 'violation_1', np.where(pandasDF['violation_position'].map(lambda d: len(d)) > 0, 1, 0)) 

# Count number of elements in dataframe column List
# pandasDF.insert(4, 'violation_2', pandasDF.violation_position.str.len())

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  
0         [2, 8]  
1            [3]  
2             []  
3         [1, 7]  
4        [2, 18]  
5             []  


### Word count, per record - (words only, not phrases - does not recognize spaces)

#### Count all instances of "money", per string

In [33]:
def check_occurences(string, word = 'money'):
    string_list = string.split(' ')
    indices_word = [i for i, x in enumerate(string_list) if x == word]
    result = 0
    for i in indices_word:
        result += 1
    return result 

pandasDF['money_count'] = pandasDF['text'].apply(lambda x: check_occurences(x)) 

print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  
0         [2, 8]            2  
1            [3]            1  
2             []            0  
3         [1, 7]            2  
4        [2, 18]            2  
5             []            0  


### Phrase count

#### Count all instances of "my money", per string

In [34]:
pandasDF['my_money_count'] = pandasDF['text'].str.count('my money')
print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  
0         [2, 8]            2               1  
1            [3]            1               1  
2             []            0               0  
3         [1, 7]            2               1  
4        [2, 18]            2               2  
5             []            0               0  


#### Count all instances of "my money account", per string

In [35]:
pandasDF['my_money_account_count'] = pandasDF['text'].str.count('my money account')
print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  
0         [2, 8]            2               1                       1  
1            [3]            1               1                       0  
2             []            0               0                       0  
3         [1, 7]            2               1                       0  
4        [2, 18]            2               2                       0  
5             []            0      

### Count Word 1 NEAR Word 2, per string

#### Count all instances of "my" NEAR "money", within 3 words of each other

In [36]:
def identify_instances(string, word_1 = 'my', word_2 = 'money', proximity = 3):
    string_list = string.split(' ')
    indices_word_1 = [i for i, x in enumerate(string_list) if x == word_1]
    indices_word_2 = [i for i, x in enumerate(string_list) if x == word_2]
    result = 0
    for i in indices_word_1:
        for j in indices_word_2:
            _distance = abs(i - j)
            if _distance <= proximity:
                result += 1
    return result

pandasDF['my_NEAR_money_count'] = pandasDF['text'].apply(lambda x: identify_instances(x))

print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   
5             []            0

#### Count all instances of "money" NEAR "account", within 3 words of each other

In [37]:
def check_occurences(string, word_1 = 'money', word_2 = 'account', proximity = 3):
    string_list = string.split(' ')
    indices_word_1 = [i for i, x in enumerate(string_list) if x == word_1]
    indices_word_2 = [i for i, x in enumerate(string_list) if x == word_2]
    result = 0
    for i in indices_word_1:
        for j in indices_word_2:
            _distance = abs(i - j)
            if _distance <= proximity:
                result += 1
    return result

pandasDF['money_NEAR_account_count'] = pandasDF['text'].apply(lambda x: check_occurences(x))

print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   
5             []            0

### Count Word 1 NOT NEAR Word 2

#### Count all instances of "money" NOT NEAR "fund", within 2 words of each other

In [38]:
def count_word_proximity(sentence, word1, word2, proximity):
    word1_indices = [i for i, word in enumerate(sentence.split()) if word == word1]
    word2_indices = [i for i, word in enumerate(sentence.split()) if word == word2]
    count = 0
    for word1_index in word1_indices:
        if not any(abs(word1_index - word2_index) <= proximity for word2_index in word2_indices):
            count += 1
    return count 

word = 'money'
not_word = 'fund'
proximity = 2

pandasDF['money_NOT_NEAR_account_count'] = pandasDF['text'].apply(lambda x: count_word_proximity(x, word, not_word, proximity))
 
print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   
5             []            0

### Count Phrase 1 NEAR Phrase 2

#### Count all instances of "my money" NEAR "my account", within 3 words of each other

In [39]:
def phrase_finder(text: str, str1: str, str2: str, distance: int) -> int:
    results = 0
    for match1 in re.finditer(str1, text):
        for match2 in re.finditer(str2, text):
            if match1.end() < match2.start():
                between_matches = text[match1.end():match2.start()]
                if len(re.findall(r'\w+', between_matches)) <= distance:
                    results += 1
            elif match2.end() < match1.start():
                between_matches = text[match2.end():match1.start()]
                if len(re.findall(r'\w+', between_matches)) <= distance:
                    results += 1
            else:
                pass
    return results

pandasDF['my_money_NEAR_my_account'] = pandasDF['text'].apply(phrase_finder, args = ('my money', 'my account', 3))

print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   
5             []            0

#### Count Phrase 1 NEAR Phrase 2 - ALTERNATE METHOD

In [40]:
def count_proximity(s, s1, s2, distance):
    xs = [m.start() for m in re.finditer(s1, s)]
    ys = [m.start() for m in re.finditer(s2, s)]
  
    count = i = j = 0
    while i < len(xs) and j < len(ys):
        x = xs[i]
        y = ys[j]
        if x <= y:
            count += int(len(s[x + len(s1) : y].split()) <= distance)
            i += 1
        else:
            count += int(len(s[y + len(s2) : x].split()) <= distance)
            j += 1
  
    return count

pandasDF['my_money_NEAR_my_account_alt'] = pandasDF['text'].apply(count_proximity, args = ('my money', 'my account', 3))

print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   
5  DEF789  is my next to next to my account next to is my...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   
5             []            0

### Count Phrase 1 NOT NEAR Phrase 2

#### Count all instances of "my money" NOT NEAR "my account", within 3 words of each other

In [144]:
def phrase_function(text: str):    
    words = word_tokenize(text)
    phrase1 = ["test", "word"]  # Phrase 1
    phrase2 = ["practice", "sentence"]  # Phrase 2
    distance = 3 # Number of words between Phrase 1 and Phrase 2
    count = 0
    i = 0
    while i < len(words) - 1:
        if words[i] == phrase1[0] and words[i + 1] == phrase1[1]:
            found_phrase2 = False
            j = i + 2
            while j < len(words) - 1 and j <= i + distance + 1:
                if words[j] == phrase2[0] and words[j + 1] == phrase2[1]:
                    found_phrase2 = True
                    break
                j += 1
            k = i - 2
            while k >= 0 and k >= i - distance - 1:
                if words[k] == phrase2[0] and words[k + 1] == phrase2[1]:
                    found_phrase2 = True
                    break
                k -= 1
            if not found_phrase2:
                count += 1
            i += 2
        else:
            i += 1
    return count

pandasDF['count'] = pandasDF['text'].apply(phrase_function)

print(pandasDF)

       id  \
0  ABC123   
1  ABC456   
2  ABC789   
3  DEF123   
4  DEF456   
5  DEF789   
6  GHI123   
7  GHI456   

                                                                                                                                                                                                                                            text  \
0  how much money do i have in my money account                                                                                                                                                                                                    
1  where is my money                                                                                                                                                                                                                               
2  hello  how are you today what is your name                                                                                                         

### Alternate method; look ahead only

In [69]:
def phrase_function(text: str):    
    words = word_tokenize(text)
    phrase1_word_count = 2  # "test word"    
    between_distance = 5  # Searches from last word in Phrase 1 ('word') to last word in Phrase 2 ('sentence')
    # Phrase 1 --> "test" = variable 'word' ==> i, and 'word' = variable 'words' ==> i + 1
    # Phrase 2 --> "practice" ==> j, "sentence" ==> j + 1
    count = 0
    
    # Variable j is used as the index of the current word being iterated over in the loop, 
    # and the len(words) function is used to get the total number of words in the list.
    
    # Looking forward from Phrase 1
    for i, word in enumerate(words):
        # If token and immediate subsequent token = "is my" ...
        if word == "test" and words[i + 1] == "word": # 'i' starts at position "is", so "my" would occur at i + 1
            # Loop checks next 3 words after "is my" and ...
            # 'j' starts at first word position of Phrase #2, which is "my"
            for j in range(i + (phrase1_word_count - 1), i + (phrase1_word_count - 1) + (between_distance + 1)):                
                # If it finds "my account", it breaks the loop and doesn't increment the count
                if j <= len(words) \
                and words[j] == "practice" and words[j + 1] == "sentence":
                    break
            else: # Otherwise, add one to the count as a positive match found
                count += 1
    return count

pandasDF['count'] = pandasDF['text'].apply(phrase_function)

print(pandasDF)

# (phrase1_word_count - 1)

       id  \
0  ABC123   
1  ABC456   
2  ABC789   
3  DEF123   
4  DEF456   
5  DEF789   
6  GHI123   
7  GHI456   
8  GHI789   

                                                                                                                                                                                                                                               text  \
0  how much money do i have in my money account                                                                                                                                                                                                       
1  where is my money                                                                                                                                                                                                                                  
2  hello  how are you today what is your name                                                                                   

In [None]:
ABC123 1
ABC456 1
ABC789 0
DEF456 0
DEF123 1

In [104]:
def count_word_proximity(sentence, word1, word2, proximity):
    word1_indices = [i for i, word in enumerate(sentence.split()) if word == word1]
    word2_indices = [i for i, word in enumerate(sentence.split()) if word == word2]
    count = 0
    for word1_index in word1_indices:
        if not any(abs(word1_index - word2_index) <= proximity for word2_index in word2_indices):
            count += 1
    return count 

word = 'money'
not_word = 'fund'
proximity = 2

pandasDF['money_NOT_NEAR_account_count'] = pandasDF['text'].apply(lambda x: count_word_proximity(x, word, not_word, proximity))
 
print(pandasDF)

       id                                               text    category  \
0  ABC123       how much money do i have in my money account  money talk   
1  ABC456                                  where is my money  money talk   
2  ABC789         hello  how are you today what is your name  money talk   
3  DEF123  my money market fund is my only money of my ac...  money talk   
4  DEF456  is my money now in my account i need to know i...  money talk   

  money_position  money_count  my_money_count  my_money_account_count  \
0         [2, 8]            2               1                       1   
1            [3]            1               1                       0   
2             []            0               0                       0   
3         [1, 7]            2               1                       0   
4        [2, 18]            2               2                       0   

   my_NEAR_money_count  money_NEAR_account_count  my_money_NEAR_my_account  \
0                    1    

### Post-Dataframe clean-up

In [41]:
# Remove 'text' column
pandasDF.drop(columns = ['text'], axis = 1, inplace = True) 

# Remove positions columns
pandasDF.drop(columns = ['money_position'], axis = 1, inplace = True)

# Remove records with no results (zero hits)
# Use this when datatype = Integer
pandasDF = pandasDF[pandasDF.money_count + pandasDF.my_money_count \
            + pandasDF.my_money_account_count + pandasDF.my_NEAR_money_count \
            + pandasDF.money_NEAR_account_count + pandasDF.money_NOT_NEAR_account_count != 0]

print(pandasDF)

       id    category  money_count  my_money_count  my_money_account_count  \
0  ABC123  money talk            2               1                       1   
1  ABC456  money talk            1               1                       0   
3  DEF123  money talk            2               1                       0   
4  DEF456  money talk            2               2                       0   

   my_NEAR_money_count  money_NEAR_account_count  \
0                    1                         1   
1                    1                         0   
3                    3                         1   
4                    3                         0   

   money_NOT_NEAR_account_count  my_money_NEAR_my_account  \
0                             2                         0   
1                             1                         0   
3                             1                         0   
4                             2                         2   

   my_money_NEAR_my_account_alt  
0       

In [30]:
https://stackoverflow.com/questions/75078466/find-word-near-other-word-within-n-of-words/75080687?noredirect=1#comment132504669_75080687

SyntaxError: invalid syntax (<ipython-input-30-e8a891ca3718>, line 1)

In [4]:
import re

df = pd.DataFrame(data)

def find_word_positions(string, word):
    positions = []
    for match in re.finditer(word, string):
        positions.append(match.start())
    return positions

df['Word position'] = df['text'].apply(find_word_positions, word='money')

print(df)

       id                                             text Word position
0  ABC123  How much money do I have in my money account?\r       [9, 31]
1  ABC456                             Where is my money?\r          [12]
2  ABC789   Hello, how are you today? What is your name?\r            []


## Enumerate Method: Efficient way to search string in a large text file

If the file is large, reading the whole file in memory is not ideal.

In [43]:
string = 'money'

with open(r"C:\Users\tshob\OneDrive\Data Science\Python\Projects\NLP\Call Transcripts.txt", 'r') as fp:
    for line_number, text in enumerate(fp):
        # search string
        if string in text:
            print(string)
            print('Line Number:', line_number)
            print('Text:', text)
            # don't look for next lines
            break

money
Line Number: 0
Text: Hello, this is Hal. How may I help you? I need to know what my account money is. Ok, I can help you with that. What is your account number? I don't know. What about your social? I prefer to not give that out. Ok, no problem. Have a nice day!"



In [23]:
Data.loc[:, "text"]

0    "Hello, my name is Hal. I need to know my acco...
1    "Hi, I would like to place a stock order. Ok, ...
Name: text, dtype: object

In [42]:
string = 'money'

with open(r"C:\Users\tshob\OneDrive\Data Science\Python\Projects\NLP\Call Transcripts.txt", 'r') as fp:
    for line_number, text in enumerate(fp):
        # search string
        if string in text:
            print(string)
            print('Line Number:', line_number)
            print('Text:', text)
            # don't look for next lines
            break

money
Line Number: 1
Text: How much money do I have?



## mmap to search for a string in text file

Fastest and most memory-efficient way to search a string in a large text file.

In [48]:
import mmap

with open(r'C:\Users\tshob\OneDrive\Data Science\Python\Projects\NLP\Call Transcripts.txt', 'rb', 0) as file:
    s = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ)
    if s.find(b'money') != -1:
        print('string exist in a file')

string exist in a file


## Fuzzy Search

In [4]:
!pip install fuzzysearch



In [50]:
from fuzzysearch import find_near_matches

In [52]:
# search for 'PATTERN' with a maximum Levenshtein Distance of 1

my_string = 'Hello, my name is Hal. What is your name?'
matches = find_near_matches('my', my_string, max_l_dist=1)

print([my_string[m.start:m.end] for m in matches])

['my', 'me', 'y', 'me']


## Search file for a list of strings

In [1]:
words = ['account', 'money']
with open(r'C:\Users\tshob\OneDrive\Data Science\Python\Projects\NLP\Call Transcripts.txt', 'r') as f:
    content = f.read()
# Iterate list to find each word
for word in words:
    if word in content:
        print('string exist in a file')

string exist in a file
string exist in a file


## Method 1: Get the position of a character in Python using rfind()

In [54]:
string = 'How much money is in my money account?'
letter = 'money'
print(string.rfind(letter))

24


## Method 2: Get position of character using regex

In [56]:
import re
string = 'How much money is in my money account?'
pattern = 'money'
match=(re.search(pattern, string))
 
#getting the starting index using match.start()
print ("starting index", match.start())
 
#Getting the start and end index in tuple format using match.span()
print ("start and end index", match.span())

starting index 9
start and end index (9, 14)


## Method 3: Get the position of a character in Python using index()

In [58]:
# Initializing string
ini_string1 = 'How much money is in my money account?'
 
# Character to find
c = "money"
# printing initial string and character
print ("initial_strings : ", ini_string1,
             "\ncharacter_to_find : ", c)
 
# Using index Method
try:
    res = ini_string1.index(c)
    print ("Character {} in string {} is present at {}".format(
                                  c, ini_string1, str(res + 1)))
except ValueError as e:
    print ("No such character available in string {}".format(ini_string1))

initial_strings :  How much money is in my money account? 
character_to_find :  money
Character money in string How much money is in my money account? is present at 10


## Method 4: Get the position of a character in Python using the loop 

In [60]:
# Initializing string
ini_string = 'How much money is in my money account?'
 
# Character to find
c = "money"
# printing initial string and character
print("initial_string : ", ini_string, "\ncharacter_to_find : ", c)
 
# Using Naive Method
res = None
for i in range(0, len(ini_string)):
    if ini_string[i] == c:
        res = i + 1
        break
 
if res == None:
    print("No such character available in string")
else:
    print("Character {} is present at {}".format(c, str(res)))

initial_string :  How much money is in my money account? 
character_to_find :  money
No such character available in string


## Method 5: Get the position of a character in Python using str.find

In [61]:
# Initializing string
ini_string = 'How much money is in my money account?'
ini_string2 = 'xyze'
 
# Character to find
c = "money"
# printing initial string and character
print("initial_strings : ", ini_string, " ",
      ini_string2, "\ncharacter_to_find : ", c)
 
# Using find Method
res1 = ini_string.find(c)
res2 = ini_string2.find(c)
 
if res1 == -1:
    print("No such character available in string {}".format(
        ini_string))
else:
    print("Character {} in string {} is present at {}".format(
        c, ini_string, str(res1 + 1)))
 
if res2 == -1:
    print("No such character available in string {}".format(
        ini_string2))
else:
    print("Character {} in string {} is present at {}".format(
        c, ini_string2, str(res2 + 1)))

initial_strings :  How much money is in my money account?   xyze 
character_to_find :  money
Character money in string How much money is in my money account? is present at 10
No such character available in string xyze


In [62]:
myStr = "I am pythonforbeginners. I provide free python tutorials for you to learn python."
substring = "python"
str_len = len(myStr)
sub_len = len(substring)
sub_indices = []
for i in range(str_len - sub_len):
    if myStr[i:i + sub_len] == substring:
        sub_indices.append(i)
print("The string is:", myStr)
print("The substring is:", substring)
print("The starting indices of the occurrences of {} in the string are:{}".format(substring, sub_indices))

The string is: I am pythonforbeginners. I provide free python tutorials for you to learn python.
The substring is: python
The starting indices of the occurrences of python in the string are:[5, 40, 74]


In [64]:

# Python3 code to demonstrate working of
# All occurrences of substring in string
# Using list comprehension + startswith()
 
# initializing string
test_str = "GeeksforGeeks is best for Geeks"
 
# initializing substring
test_sub = "Geeks"
 
# printing original string
print("The original string is : " + test_str)
 
# printing substring
print("The substring to find : " + test_sub)
 
# using list comprehension + startswith()
# All occurrences of substring in string
res = [i for i in range(len(test_str)) if test_str.startswith(test_sub, i)]
 
# printing result
print("The start indices of the substrings are : " + str(res))

The original string is : GeeksforGeeks is best for Geeks
The substring to find : Geeks
The start indices of the substrings are : [0, 8, 26]


In [65]:

# Python3 code to demonstrate working of
# All occurrences of substring in string
# Using re.finditer()
import re
 
# initializing string
test_str = "GeeksforGeeks is best for Geeks"
 
# initializing substring
test_sub = "Geeks"
 
# printing original string
print("The original string is : " + test_str)
 
# printing substring
print("The substring to find : " + test_sub)
 
# using re.finditer()
# All occurrences of substring in string
res = [i.start() for i in re.finditer(test_sub, test_str)]
 
# printing result
print("The start indices of the substrings are : " + str(res))

The original string is : GeeksforGeeks is best for Geeks
The substring to find : Geeks
The start indices of the substrings are : [0, 8, 26]


In [66]:
# Python3 code to demonstrate working of
# All occurrences of substring in string
 
# initializing string
test_str = "GeeksforGeeks is best for Geeks"
 
# initializing substring
test_sub = "Geeks"
 
# printing original string
print("The original string is : " + test_str)
 
# printing substring
print("The substring to find : " + test_sub)
res=[]
while(test_str.find(test_sub)!=-1):
    res.append(test_str.find(test_sub))
    test_str=test_str.replace(test_sub,"*"*len(test_sub),1)
 
# printing result
print("The start indices of the substrings are : " + str(res))

The original string is : GeeksforGeeks is best for Geeks
The substring to find : Geeks
The start indices of the substrings are : [0, 8, 26]


In [19]:
I need a function that identifies instances where 'Word 1' is within ***N***# words of 'Word 2'

For example, here is my dataframe and objective:

**PANDAS DATAFRAME**
| Record ID       | String         |
| -------- | -------------- |
| ABC123   | This is the first example sentence the end of sentence one   |
| ABC456   | This is the second example sentence one more sentence to come    |
| ABC789   | There are no more example sentences    |

Word 1 = 'sentence' <br />
Word 2 = 'the' <br />
***N***# of words displacement = 3

-- Apply **enumerate regex** function ? -- <br />

**OUTPUT**
| Record ID       | Occurrences Identified        |
| -------- | -------------- |
| ABC123   | 3    |
| ABC456   | 1    |
| ABC789   | 0    |


**BONUS (nice to have, but not necessary)**
**OUTPUT**
| Record ID       | Occurrences Identified | Word 1 position |
| -------- | -------------- | ------ |
| ABC123   | 3    | [5, 9]
| ABC456   | 1    | [5]
| ABC789   | 0    | [0]

Append an extra dataframe column (list format) showing the word position for the each Word 1 that resulted in a positive hit, but only for records (rows) with at least one identified occurrence. 

SyntaxError: invalid syntax (<ipython-input-19-ce4bd080a8a2>, line 1)

In [None]:
I need a function that identifies instances in a string when 'Word 1' is within ***N***# words of 'Word 2'

For example, here is my dataframe and objective:

I need a function that identifies instances where 'Word 1' is within ***N***# words of 'Word 2'

For example, here is my dataframe and objective:

**PANDAS DATAFRAME**
| Record ID       | String         |
| -------- | -------------- |
| ABC123   | This is the first example sentence the end of sentence one   |
| ABC456   | This is the second example sentence one more sentence to come    |
| ABC789   | There are no more example sentences    |

Word 1 = 'sentence' <br />
Word 2 = 'the' <br />
***N***# of words displacement = 3 <br />

-- Apply **enumerate regex** function ? -- 

**OUTPUT**
| Record ID       | Occurrences Identified        |
| -------- | -------------- |
| ABC123   | 3    |
| ABC456   | 1    |
| ABC789   | 0    |

If there's a different function besides regex that would be much simpler, please share.

I can't provide any code attempts because I couldn't figure out where to start, and I uploaded this image of a hypothetical, static table because StackOverflow wasn't allowing me to post it the proper way.

In [15]:
input_data = [['ABC123', 'This is the first example sentence the end of sentence one'], ['ABC456', 'This is the second example sentence one more sentence to come'], ['ABC789', 'There are no more example sentences']]
df1 = pd.DataFrame(input_data, columns=['Record ID', 'String'])
print(df1)

  Record ID                                             String
0    ABC123  This is the first example sentence the end of ...
1    ABC456  This is the second example sentence one more s...
2    ABC789                There are no more example sentences


In [16]:
output_data = [['ABC123', 3], ['ABC456', 1], ['ABC789', 0]]
df2 = pd.DataFrame(output_data, columns=['Record ID', 'Occurrences Identified'])
print(df2)

  Record ID  Occurrences Identified
0    ABC123                       3
1    ABC456                       1
2    ABC789                       0


In [14]:
I need a function that identifies instances in a string when 'Word 1' is within ***N***# words of 'Word 2'

For example, here is my dataframe and objective:

**Pandas Dataframe Input** <br />

    data = [['ABC123', 'This is the first example sentence the end of sentence one'], ['ABC456', 'This is the second example sentence one more sentence to come'], ['ABC789', 'There are no more example sentences']]
    df = pd.DataFrame(data, columns=['Record ID', 'String'])
    print(df)

    Record ID | String
    ----------|-----------------------
    ABC123    | This is the first example sentence the end of sentence one
    ABC456    | This is the second example sentence one more sentence to come
    ABC456    | There are no more example sentences

Word 1 = 'sentence' <br />
Word 2 = 'the' <br />
Within ***N***# of words (displaced) = 3 <br />

**Desired Dataframe Output** <br />

    output_data = [['ABC123', 3], ['ABC456', 1], ['ABC789', 0]]
    df = pd.DataFrame(output_data, columns=['Record ID', 'Occurrences Identified'])
    print(df)

    Record ID | Occurrences Identified
    ----------|-----------------------
    ABC123    | 3
    ABC456    | 1
    ABC456    | 0

I think the regex part will take the general form of this, but I'm not sure how to apply it towards my use-case here in Python and ... I'm not sure where to start with a enumerate function.

    \b(?:'sentence'\W+(?:\w+\W+){0,3}?'the'|'the'\W+(?:\w+\W+){0,3}?'sentence')\b

If there's a different function besides regex that would be much simpler, please share.

I appreciate any help, thanks!


SyntaxError: invalid syntax (<ipython-input-14-ef4f62ba3f6a>, line 1)

In [22]:
input_data = [['ABC123', 'This is the first example sentence the end of sentence one'], ['ABC456', 'This is the second example sentence one more sentence to come'], ['ABC789', 'There are no more example sentences']]
df1 = pd.DataFrame(input_data, columns=['Record ID', 'String'])
print(df1)

  Record ID                                             String
0    ABC123  This is the first example sentence the end of ...
1    ABC456  This is the second example sentence one more s...
2    ABC789                There are no more example sentences


In [24]:
def word(target, source):
    for i, w in enumerate(source):
        if w == target:
            return source[i + 1]

print(word('sentence', df1['String']))

None


In [None]:
def word(target, source):
    for i, w in enumerate(source):
        if w == target:
            return source[i - 1], source[i - 2], source[i - 3], source[i + 1], source[i + 2], source[i + 3]

print(word('sentence', df1['String']))

In [122]:
data = [['ABC123', 'can you help me with this'], ['ABC456', 'I can not help with this, but they can help'] \
       , ['ABC789', 'No one can help can they'], ['DEF123', 'They were able to help']]
pandasDF = pd.DataFrame(data, columns=['Record ID', 'String'])
print(pandasDF)

  Record ID                                       String
0    ABC123                    can you help me with this
1    ABC456  I can not help with this, but they can help
2    ABC789                     No one can help can they
3    DEF123                       They were able to help


In [41]:
import numpy as np

def identify_instances(string, word_1 = 'can', word_2 = 'help', allowed_distance = 3):
    string_list = np.array(string.split(' '))
    
    # Find matches for first word
    first_word = string_list == word_1
    
    # Find matches for second word
    second_word = string_list == word_2 
    
    # Find matches for both words
    matches = first_word[None] & second_word[:, None]
    
    # Find indices of the above matches
    indices = np.stack(np.where(matches)).T
    
    # in-place
    indices.sort()
    
    # Limit it to the close matches
    indices = indices[(indices[:, 1] - indices[:,0]) <= allowed_distance]
    
    # Pair list format
    return [tuple(i) for i in indices]

# Apply search function and insert search word
pandasDF['can_NEAR_help_position'] = pandasDF['String'].apply(identify_instances)
 
print(pandasDF)

  Record ID                                       String  \
0    ABC123                    can you help me with this   
1    ABC456  I can not help with this, but they can help   
2    ABC789                     No one can help can they   
3    DEF123                       They were able to help   

  can_NEAR_help_position  
0               [(0, 2)]  
1       [(1, 3), (8, 9)]  
2       [(2, 3), (3, 4)]  
3                     []  


In [128]:
data = [['ABC123', 'can you help me with this'], ['ABC456', 'I can not help with this, but they can help'] \
       , ['ABC789', 'No one can help can they'], ['DEF123', 'They were able to help']]
pandasDF = pd.DataFrame(data, columns=['Record ID', 'String'])
print(pandasDF)

  Record ID                                       String
0    ABC123                    can you help me with this
1    ABC456  I can not help with this, but they can help
2    ABC789                     No one can help can they
3    DEF123                       They were able to help


In [129]:
def indices_1(string, word_1 = 'can', word_2 = 'help', allowed_distance = 3):
    string = string.split()
    indices_word_1 = [i for i, x in enumerate(string) if x == word_1]
    indices_word_2 = [i for i, x in enumerate(string) if x == word_2]
    for i in indices_word_1:
        for j in indices_word_2:
            _distance = abs(i - j)
            if _distance <= allowed_distance:
                return indices_word_2

def indices_2(string, search_word):   
    string = string.split()
    return [i for i, x in enumerate(string) if x == search_word]

pandasDF['help_position_NEAR_can'] = pandasDF['String'].apply(indices_1)

pandasDF['help_position'] = pandasDF['String'].apply(indices_2, search_word = 'help')

print(pandasDF)

  Record ID                                       String  \
0    ABC123                    can you help me with this   
1    ABC456  I can not help with this, but they can help   
2    ABC789                     No one can help can they   
3    DEF123                       They were able to help   

  help_position_NEAR_can help_position  
0                    [2]           [2]  
1                 [3, 9]        [3, 9]  
2                    [3]           [3]  
3                   None           [4]  


In [5]:
import pandas as pd

data = [['ABC123', 'can you help me with this'], ['ABC456', 'I can not help with this, but they can help'] \
       , ['ABC789', 'No one can help can they'], ['DEF123', 'They were able to help']]
pandasDF = pd.DataFrame(data, columns=['Record ID', 'String'])
print(pandasDF)

  Record ID                                       String
0    ABC123                    can you help me with this
1    ABC456  I can not help with this, but they can help
2    ABC789                     No one can help can they
3    DEF123                       They were able to help


In [None]:
def count_word_proximity(sentence, word1, word2, proximity):
    word1_indices = [i for i, word in enumerate(sentence.split()) if word == word1]
    word2_indices = [i for i, word in enumerate(sentence.split()) if word == word2]
    count = 0
    for word1_index in word1_indices:
        if not any(abs(word1_index - word2_index) <= proximity for word2_index in word2_indices):
            count += 1
    return count 

word1 = "help"
word2 = "can"

proximity = 3

In [6]:
def indices_1(string, word_1 = 'can', word_2 = 'help', allowed_distance = 3):
    string = string.split()
    indices_word_1 = [i for i, x in enumerate(string) if x == word_1]
    indices_word_2 = [i for i, x in enumerate(string) if x == word_2]
    for i in indices_word_1:
        for j in indices_word_2:
            _distance = abs(i - j)
            if _distance <= allowed_distance:
                return indices_word_2

def indices_2(string, search_word):   
    string = string.split()
    return [i for i, x in enumerate(string) if x == search_word]

a = pandasDF['String'].apply(indices_1)
b = pandasDF['String'].apply(indices_2, search_word = 'help')

print(a)
print(b)

0       [2]
1    [3, 9]
2       [3]
3      None
Name: String, dtype: object
0       [2]
1    [3, 9]
2       [3]
3       [4]
Name: String, dtype: object


In [7]:
pandasDF.dtypes

Record ID    object
String       object
dtype: object

In [8]:
import numpy

arr = numpy.array(a)
arr = numpy.array(b)

print(b)

0       [2]
1    [3, 9]
2       [3]
3       [4]
Name: String, dtype: object


In [9]:
a = tuple(a)
b = tuple(b)

In [10]:
for i in a:
   if i in b:
      b.remove(i)

print(b)

AttributeError: 'tuple' object has no attribute 'remove'

In [None]:
for i in b:
   if i in pandasDF['String'].apply(indices_2, search_word = 'help'):
      a.remove(i)

print(a)

In [136]:
a = [0, 1, 2, 4, 5]
b = [1, 3, 4, 5, 7]

for i in b:
   if i in a:
      a.remove(i)

print(a)

[0, 2]


In [109]:
pandasDF['help_position'].dtypes

dtype('O')

In [114]:
del pandasDF['help_position'][3]

print(pandasDF)

  Record ID                                       String  \
0    ABC123                    can you help me with this   
1    ABC456  I can not help with this, but they can help   
2    ABC789                     No one can help can they   
3    DEF123                       They were able to help   

  can_NEAR_help_position help_position  
0                    [2]           [2]  
1                 [3, 9]        [3, 9]  
2                    [3]           [3]  
3                   None           [4]  


In [115]:
a = [1,2,3,4,5,6]
b = [2,3]

for i in b:
   if i in a:
      a.remove(i)

print(a)

[1, 4, 5, 6]


In [96]:
myList = ["Bran", 11, 22, 33, "Stark", 22, 33, 11]
 
del myList[2]
 
myList

['Bran', 11, 33, 'Stark', 22, 33, 11]

In [92]:
list1 = pandasDF['can_NEAR_help_position']
list2 = pandasDF['help_position']       
             
def common_elements(list1, list2):
    return [element for element in list1 if element in list2]

lst1 = [23, 15, 2, 14, 14, 16, 20 ,52]
lst2 = [2, 48, 15, 12, 26, 32, 47, 54]

print(common_elements(lst1, lst2))

[15, 2]


In [94]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3
 
# Driver Code
lst1 = pandasDF['can_NEAR_help_position']
lst2 = pandasDF['help_position']

print(intersection(lst1, lst2))

[15, 2]


In [None]:
import operator as op

def common_member(a, b):
    result = [i for i in a if op.countOf(b,i) > 0]
    return result

pandasDF['common'] = pandasDF['help_position'].apply(common_member, a = pandasDF['help_position'])

In [74]:
def common_member(a, b):
    result = [i for i in a if i in b]
    return result
 
a = tuple(pandasDF['can_NEAR_help_position'])
b = pandasDF['help_position']
 
print("The common elements in the two lists are: ")
print(common_member(a, b))

The common elements in the two lists are: 


TypeError: unhashable type: 'list'

In [70]:
import collections

def common_member(a, b):
    result = collections.Counter(a) & collections.Counter(b)
    return result.keys()
  
a = hash(tuple(pandasDF['can_NEAR_help_position']))
b = hash(tuple(pandasDF['help_position']))

print(common_member(a, b))

TypeError: unhashable type: 'list'

In [62]:
# Importing reduce function
from functools import reduce

# Finding a common element
result = list(reduce(set.intersection, map(set, [pandasDF.can_NEAR_help_position, pandasDF.help_position])))

# Display Result
print("Common element:\n",result)

TypeError: unhashable type: 'list'

In [73]:
data = [['ABC123', 'how much money do i have in my money account'], ['ABC456', 'where is my money '] \
        , ['ABC789', 'hello  how are you today what is your name'], ['DEF123', 'my money market fund is my only money of my account'] \
        , ['DEF456', 'is my money now in my account i need to know if my account has any of my money'] \
        , ['DEF789', 'is my next to after to my account then from to is my and then next is my next my account']
        , ['GHI123', 'test word one in practice sentence practice sentence two contains this test word two here is test word three in my last practice sentence']
        , ['GHI456', 'test sentence one test word one practice sentence two two test word two two practice sentence three three three test word four four four four practice sentence five five five five five test word five five five five five practice sentence']]    

pandasDF = pd.DataFrame(data, columns=['id', 'text'])

print(pandasDF)

       id  \
0  ABC123   
1  ABC456   
2  ABC789   
3  DEF123   
4  DEF456   
5  DEF789   
6  GHI123   
7  GHI456   

                                                                                                                                                                                                                                            text  
0  how much money do i have in my money account                                                                                                                                                                                                   
1  where is my money                                                                                                                                                                                                                              
2  hello  how are you today what is your name                                                                                                            