### Pandas equivalent example to work with Document Clustering for People Wiki dataframe

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#people = pd.read_csv(f"D:/people_wiki.csv")
people = pd.read_csv(f"D:/SYED/people_wiki.csv")
people.head(3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...


In [3]:
obama = people[people['name'] == 'Barack Obama']
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(obama['text'])
matrix

<1x270 sparse matrix of type '<class 'numpy.int64'>'
	with 270 stored elements in Compressed Sparse Row format>

In [4]:
counts = pd.DataFrame(matrix.toarray(), columns = vectorizer.get_feature_names_out())
counts.T.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
the,40
in,30
and,21
of,18
to,14
his,11
obama,9
act,8
he,7
as,6


In [5]:
counts['the']

0    40
Name: the, dtype: int64

In [6]:
people.head(3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...


In [7]:
pep1 = people[10:100]
pep1.head(3)

Unnamed: 0,URI,name,text
10,<http://dbpedia.org/resource/Sophie_Crumb>,Sophie Crumb,sophia violet sophie crumb born september 27 1...
11,<http://dbpedia.org/resource/Jenn_Ashworth>,Jenn Ashworth,jenn ashworth is an english writer she was bor...
12,<http://dbpedia.org/resource/Jonathan_Hoefler>,Jonathan Hoefler,jonathan hoefler born august 22 1970 is an ame...


In [8]:
pep1.rename(columns = {'text':'text_data'}, inplace = True)
pep1.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pep1.rename(columns = {'text':'text_data'}, inplace = True)


Unnamed: 0,URI,name,text_data
10,<http://dbpedia.org/resource/Sophie_Crumb>,Sophie Crumb,sophia violet sophie crumb born september 27 1...
11,<http://dbpedia.org/resource/Jenn_Ashworth>,Jenn Ashworth,jenn ashworth is an english writer she was bor...
12,<http://dbpedia.org/resource/Jonathan_Hoefler>,Jonathan Hoefler,jonathan hoefler born august 22 1970 is an ame...


## Function to make word-count as dict

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
def make_counts(text_data):
    cv_fit = cv.fit_transform(text_data)
    word_list = cv.get_feature_names_out()
    count_list = np.asarray(cv_fit.sum(axis=0))[0]
    final_dict = dict(zip(word_list, count_list))
    return final_dict

### Apply function to dataframe

In [11]:
pep1['word_count'] = pep1['text_data'].apply(lambda x: make_counts([x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pep1['word_count'] = pep1['text_data'].apply(lambda x: make_counts([x]))


In [12]:
pep1['word_count'][15]

{'2006': 1,
 '2012': 1,
 '84minute': 1,
 'about': 1,
 'age': 1,
 'also': 3,
 'and': 10,
 'appearances': 1,
 'are': 1,
 'artist': 1,
 'as': 2,
 'atmospheres': 1,
 'attention': 2,
 'attracted': 2,
 'audiovisual': 1,
 'available': 1,
 'band': 1,
 'bands': 1,
 'based': 1,
 'berlinduring': 1,
 'but': 1,
 'camera': 1,
 'childhood': 1,
 'combines': 1,
 'common': 2,
 'content': 1,
 'contributes': 1,
 'creative': 2,
 'desert': 2,
 'design': 1,
 'designerartist': 1,
 'development': 1,
 'documentaries': 1,
 'documentary': 2,
 'drawing': 1,
 'due': 1,
 'during': 1,
 'edge': 1,
 'editor': 1,
 'electronic': 1,
 'elements': 1,
 'exist': 1,
 'explores': 1,
 'extraordinary': 1,
 'fans': 1,
 'feature': 1,
 'few': 1,
 'fictional': 1,
 'film': 3,
 'filmmaker': 1,
 'films': 2,
 'first': 1,
 'for': 2,
 'friends': 1,
 'from': 1,
 'fu': 1,
 'fuzzomentary': 1,
 'gap': 1,
 'generate': 1,
 'genre': 1,
 'genres': 1,
 'german': 1,
 'graphic': 2,
 'guest': 1,
 'have': 1,
 'he': 4,
 'his': 8,
 'homme': 1,
 'illustra