Skip to content

Commit 636ea0b

Browse files
NLP Basics
1 parent 1426bfc commit 636ea0b

15 files changed

+32886
-0
lines changed
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"text/html": [
11+
"<div>\n",
12+
"<style scoped>\n",
13+
" .dataframe tbody tr th:only-of-type {\n",
14+
" vertical-align: middle;\n",
15+
" }\n",
16+
"\n",
17+
" .dataframe tbody tr th {\n",
18+
" vertical-align: top;\n",
19+
" }\n",
20+
"\n",
21+
" .dataframe thead th {\n",
22+
" text-align: right;\n",
23+
" }\n",
24+
"</style>\n",
25+
"<table border=\"1\" class=\"dataframe\">\n",
26+
" <thead>\n",
27+
" <tr style=\"text-align: right;\">\n",
28+
" <th></th>\n",
29+
" <th>label</th>\n",
30+
" <th>body_text</th>\n",
31+
" </tr>\n",
32+
" </thead>\n",
33+
" <tbody>\n",
34+
" <tr>\n",
35+
" <th>0</th>\n",
36+
" <td>ham</td>\n",
37+
" <td>I've been searching for the right words to tha...</td>\n",
38+
" </tr>\n",
39+
" <tr>\n",
40+
" <th>1</th>\n",
41+
" <td>spam</td>\n",
42+
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
43+
" </tr>\n",
44+
" <tr>\n",
45+
" <th>2</th>\n",
46+
" <td>ham</td>\n",
47+
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
48+
" </tr>\n",
49+
" <tr>\n",
50+
" <th>3</th>\n",
51+
" <td>ham</td>\n",
52+
" <td>Even my brother is not like to speak with me. ...</td>\n",
53+
" </tr>\n",
54+
" <tr>\n",
55+
" <th>4</th>\n",
56+
" <td>ham</td>\n",
57+
" <td>I HAVE A DATE ON SUNDAY WITH WILL!!</td>\n",
58+
" </tr>\n",
59+
" </tbody>\n",
60+
"</table>\n",
61+
"</div>"
62+
],
63+
"text/plain": [
64+
" label body_text\n",
65+
"0 ham I've been searching for the right words to tha...\n",
66+
"1 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
67+
"2 ham Nah I don't think he goes to usf, he lives aro...\n",
68+
"3 ham Even my brother is not like to speak with me. ...\n",
69+
"4 ham I HAVE A DATE ON SUNDAY WITH WILL!!"
70+
]
71+
},
72+
"execution_count": 1,
73+
"metadata": {},
74+
"output_type": "execute_result"
75+
}
76+
],
77+
"source": [
78+
"import pandas as pd\n",
79+
"fullCorpus=pd.read_csv(\"SMSSpamCollection.tsv\",sep='\\t',header=None)\n",
80+
"fullCorpus.columns=['label','body_text']\n",
81+
"fullCorpus.head()"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 2,
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"name": "stdout",
91+
"output_type": "stream",
92+
"text": [
93+
"no of rows : 5568\n",
94+
"no of columns : 2\n"
95+
]
96+
}
97+
],
98+
"source": [
99+
"# what is the shape of data set\n",
100+
"print('no of rows :',len(fullCorpus))\n",
101+
"print('no of columns :',len(fullCorpus.columns))"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": 4,
107+
"metadata": {},
108+
"outputs": [
109+
{
110+
"name": "stdout",
111+
"output_type": "stream",
112+
"text": [
113+
"total ham : 4822\n",
114+
"total spam : 746\n"
115+
]
116+
}
117+
],
118+
"source": [
119+
"# how many spam/ham are there\n",
120+
"print('total ham : ',len(fullCorpus[fullCorpus['label']=='ham']))\n",
121+
"print('total spam : ',len(fullCorpus[fullCorpus['label']=='spam']))"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": 5,
127+
"metadata": {},
128+
"outputs": [
129+
{
130+
"ename": "AttributeError",
131+
"evalue": "'DataFrame' object has no attribute 'isNull'",
132+
"output_type": "error",
133+
"traceback": [
134+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
135+
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
136+
"\u001b[0;32m<ipython-input-5-d2eb38269199>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# how much missing data is there\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfullCorpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misNull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
137+
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5177\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5178\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5179\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5180\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5181\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
138+
"\u001b[0;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'isNull'"
139+
]
140+
}
141+
],
142+
"source": [
143+
"# how much missing data is there\n",
144+
"len(fullCorpus.isNull())"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": null,
150+
"metadata": {},
151+
"outputs": [],
152+
"source": []
153+
}
154+
],
155+
"metadata": {
156+
"kernelspec": {
157+
"display_name": "Python 3",
158+
"language": "python",
159+
"name": "python3"
160+
},
161+
"language_info": {
162+
"codemirror_mode": {
163+
"name": "ipython",
164+
"version": 3
165+
},
166+
"file_extension": ".py",
167+
"mimetype": "text/x-python",
168+
"name": "python",
169+
"nbconvert_exporter": "python",
170+
"pygments_lexer": "ipython3",
171+
"version": "3.7.5"
172+
}
173+
},
174+
"nbformat": 4,
175+
"nbformat_minor": 2
176+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

0 commit comments

Comments
 (0)