### Pandas equivalent example to work with Document Clustering for People Wiki dataframe

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#people = pd.read_csv(f"D:/people_wiki.csv")
people = pd.read_csv(f"D:/SYED/people_wiki.csv")
people.head(3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...


In [3]:
obama = people[people['name'] == 'Barack Obama']
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(obama['text'])
matrix

<1x270 sparse matrix of type '<class 'numpy.int64'>'
	with 270 stored elements in Compressed Sparse Row format>

In [4]:
counts = pd.DataFrame(matrix.toarray(), columns = vectorizer.get_feature_names_out())
counts.T.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
the,40
in,30
and,21
of,18
to,14
his,11
obama,9
act,8
he,7
as,6


In [5]:
people.head(3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...


In [6]:
pep1 = people[0:100]
pep1.head(3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...


## Function to make word-count as dict

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
def make_counts(text_data):
    cv_fit = cv.fit_transform(text_data)
    word_list = cv.get_feature_names_out()
    count_list = np.asarray(cv_fit.sum(axis=0))[0]
    final_dict = dict(zip(word_list, count_list))
    return final_dict

### Apply function to dataframe

In [9]:
pep1['word_count'] = pep1['text'].apply(lambda x: make_counts([x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pep1['word_count'] = pep1['text'].apply(lambda x: make_counts([x]))


In [18]:
pep1['name'][2]

'Harpdog Brown'

In [19]:
Harp = pep1.query("name == 'Harpdog Brown'")
Harp_idx = Harp.index
Harp

Unnamed: 0,URI,name,text,word_count
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'1982': 1, '1995': 1, '2010': 1, '2014': 2, '..."


In [11]:
obama['word_count'] = obama['text'].apply(lambda x: make_counts([x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obama['word_count'] = obama['text'].apply(lambda x: make_counts([x]))


## TF-IDF Calculation

from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
def make_tfidf(text_data):
    xval = v.fit_transform(text_data)
    names_val = v.get_feature_names_out()
    return names_val

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_term_matrix = tfidf_vect.fit_transform(pep1.text)

In [31]:
pep1_tfidf = pd.DataFrame(
    data = tfidf_term_matrix[Harp_idx].toarray()[0],
    index = tfidf_vect.get_feature_names_out(),
    columns = ['tfidf']
)

In [32]:
pep1_tfidf.sort_values(by='tfidf', ascending=False)

Unnamed: 0,tfidf
blues,0.609974
best,0.182228
band,0.159269
harpdog,0.157768
harmonica,0.157768
...,...
espn,0.000000
especially,0.000000
esp,0.000000
esch,0.000000


In [33]:
pep1.head(3)

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'10': 1, '1979': 1, '19982000': 1, '2000': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'1973': 1, '1981': 1, '2005': 1, '24hour': 1,..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'1982': 1, '1995': 1, '2010': 1, '2014': 2, '..."
