e

ym001 · May 17, 2020 · 5295386 · 5295386
1 parent 9bb15e4
commit 5295386
Show file tree

Hide file tree

Showing 17 changed files with 333 additions and 36 deletions.
diff --git a/Exemples/exemple_Dataset.py b/Exemples/exemple_Dataset.py
@@ -32,4 +32,10 @@
 import exemple_Dataset5
 import exemple_Dataset6
 import exemple_Dataset7
+import exemple_Dataset8
+import exemple_Dataset9
+import exemple_Dataset10
+import exemple_Dataset11
+import exemple_Dataset12
+import exemple_Dataset13
 
diff --git a/Exemples/exemple_Dataset12.py b/Exemples/exemple_Dataset12.py
@@ -0,0 +1,12 @@
+# Example: load the 'Amazon Review Polarity' dataset together with its
+# test split and its description, then show a sample of each part.
+from Manteia.Dataset import Dataset
+
+ds=Dataset('Amazon Review Polarity',test=True,desc=True)
+
+# First five training documents and their labels.
+print('Train : ')
+print(ds.documents_train[:5])
+print(ds.labels_train[:5])
+
+# First five test documents and their labels (header added so the output
+# matches the layout used by exemple_Dataset13.py).
+print('Test : ')
+print(ds.documents_test[:5])
+print(ds.labels_test[:5])
+
+print('Description :')
+print(ds.description)
+
+
diff --git a/Exemples/exemple_Dataset13.py b/Exemples/exemple_Dataset13.py
@@ -0,0 +1,16 @@
+# Example: load the 'Amazon Review Full' dataset together with its test
+# split and its description, then show a sample of each part.
+from Manteia.Dataset import Dataset
+
+ds=Dataset('Amazon Review Full',test=True,desc=True)
+
+# First five training documents and their labels.
+print('Train : ')
+print(ds.documents_train[:5])
+print(ds.labels_train[:5])
+
+# First five test documents and their labels.
+print('Test : ')
+print(ds.documents_test[:5])
+print(ds.labels_test[:5])
+
+# Text description loaded because desc=True was requested.
+print('Description :')
+print(ds.description)
+
+
diff --git a/Exemples/exemple_Dataset14.py b/Exemples/exemple_Dataset14.py
@@ -0,0 +1,6 @@
+# Example: load the 'Short_Jokes' dataset with the default options and
+# show the first five training documents.
+from Manteia.Dataset import Dataset
+
+ds=Dataset('Short_Jokes')
+
+print('Train : ')
+print(ds.documents_train[:5])
diff --git a/Exemples/exemple_Dataset15.py b/Exemples/exemple_Dataset15.py
@@ -0,0 +1,7 @@
+# Example: load the 'Tweeter Airline Sentiment' dataset with the default
+# options and show the first five training documents and labels.
+from Manteia.Dataset import Dataset
+
+ds=Dataset('Tweeter Airline Sentiment')
+
+print('Train : ')
+print(ds.documents_train[:5])
+print(ds.labels_train[:5])
diff --git a/Manteia/Dataset.py b/Manteia/Dataset.py
@@ -26,23 +26,17 @@
 class Dataset:
 
 	r"""
-		This is the class to give datasets.
+		Class used to download and load text datasets.
 		
 		
 		* **name**        - name of the dataset (str)
 		* **train**       - load the dataset train Default: ‘True’.
 		* **test**        - load the dataset test Default: ‘False’.
 		* **dev**         - load the dataset dev Default: ‘False’.
 		* **description** - load description Default: ‘False’.
-		* **url**         - 
-		* **verbose**     - 
+		* **verbose**     - print progress messages while loading Default: ‘True’.
 		* **path**        - Path to the data file.
-			
-		.. code-block:: python
-
-			print('hello')
 		
-				 
 	"""
 	def __init__(self,name='20newsgroups',train=True,test=False,dev=False,classe=False,desc=False,path='./dataset',verbose=True):
 		r"""
@@ -102,6 +96,13 @@ def load(self):
 			self.load_Yelp_Review_Full()
 		if self.name=="Yelp Review Polarity":
 			self.load_Yelp_Review_Polarity()
+
+		if self.name=="Short_Jokes":
+			self.load_Short_Jokes()
+
+		if self.name=="Tweeter Airline Sentiment":
+			self.load_Tweeter_Airline_Sentiment()
+
 
 	def load_20newsgroups(self):
 		r"""
@@ -349,7 +350,27 @@ def load_Sogou_News(self):
 
 
 	def load_Amazon_Review_Polarity(self):
-
+		"""
+		Defines the Amazon Review Polarity dataset.
+			The labels include:
+			
+			* 1 : Negative polarity.
+
+			* 2 : Positive polarity.
+
+		.. code-block:: python
+
+			from Manteia.Dataset import Dataset
+
+			ds=Dataset('Amazon Review Polarity',test=True,desc=True)
+
+			print('Train : ')
+			print(ds.documents_train[:5])
+			print(ds.labels_train[:5])
+			print(ds.documents_test[:5])
+			print(ds.labels_test[:5])
+			print(ds.description)
+		"""
 		self.path_dir = os.path.join(self.path,'amazon_review_polarity')
 		#!!!!!!!!!!!!!!!!!!!!
 		#self.del_dir(self.path_dir)
@@ -379,10 +400,32 @@ def load_Amazon_Review_Polarity(self):
 				self.description+=row
 
 	def load_Amazon_Review_Full(self):
-
+		r"""
+		Defines Amazon Review Full Star Dataset.
+			The labels include:
+			
+			**1 - 5** : rating classes (5 is highly recommended).
+
+		.. code-block:: python
+
+			from Manteia.Dataset import Dataset
+
+			ds=Dataset('Amazon Review Full',test=True,desc=True)
+
+			print('Train : ')
+			print(ds.documents_train[:5])
+			print(ds.labels_train[:5])
+
+			print('Test : ')
+			print(ds.documents_test[:5])
+			print(ds.labels_test[:5])
+
+			print('Description :')
+			print(ds.description)
+		"""
 		self.path_dir = os.path.join(self.path,'amazon_review_full')
 		#!!!!!!!!!!!!!!!!!!!!
-		#self.del_dir(self.path_dir)
+		self.del_dir(self.path_dir)
 		#!!!!!!!!!!!!!!!!!!!!
 
 		if not os.path.isdir(self.path_dir):
@@ -914,9 +957,83 @@ def load_pubmed_rct20k(self):
 					self.documents_dev.append(row[1])
 					self.labels_dev.append(row[0])
 
+	def load_Short_Jokes(self):
+
+		r"""
+		Defines Short_Jokes dataset.
+
+		The archive is downloaded and extracted into ``<path>/Short_Jokes``
+		when that directory does not exist yet. If ``train`` is set,
+		``documents_train`` is filled with the second column of
+		``shortjokes.csv``. No labels are loaded by this method.
+
+		.. code-block:: python
+
+			from Manteia.Dataset import Dataset
 
+			ds=Dataset('Short_Jokes')
+
+			print('Train : ')
+			print(ds.documents_train[:5])
+		"""
+		self.documents_train = []
+
+		path_dir=os.path.join(self.path,'Short_Jokes')
+		if not os.path.isdir(path_dir):
+			os.mkdir(path_dir)
+			url_train = 'https://github.com/ym001/Dune/raw/master/datasets/short-jokes.zip'
+			if self.verbose:
+				print("Downloading and extracting Short_Jokes...")
+			download_and_extract(url_train, path_dir)
+		if self.train:
+			path_file=os.path.join(path_dir,'shortjokes.csv')
+			# Parse with the csv module so that quoted jokes containing commas
+			# are kept instead of being dropped by a naive split(','), and use
+			# a context manager so the file is always closed.
+			with open(path_file, "r", newline='') as fi:
+				for row in csv.reader(fi):
+					# NOTE(review): if the file starts with a header row, its
+					# second field is stored as a document (unchanged from the
+					# previous behaviour) - confirm against the real file.
+					if len(row)==2:
+						self.documents_train.append(row[1].strip())
+
+
+	def load_Tweeter_Airline_Sentiment(self):
+
+		r"""
+		Defines Tweeter Airline Sentiment dataset.
+			The labels include:
+			
+			* positive.
+			* neutral.
+			* negative.
+
+		The archive is downloaded and extracted into
+		``<path>/Tweeter_Airline_Sentiment`` when that directory does not exist
+		yet. If ``train`` is set, ``documents_train`` is filled from the
+		``text`` column and ``labels_train`` from the ``airline_sentiment``
+		column of ``Airline-Sentiment.csv``.
+
+		.. code-block:: python
+
+			from Manteia.Dataset import Dataset
+
+			ds=Dataset('Tweeter Airline Sentiment')
+
+			print('Train : ')
+			print(ds.documents_train[:5])
+			print(ds.labels_train[:5])
+		"""
+		self.documents_train = []
+		self.labels_train = []
+
+		path_dir=os.path.join(self.path,'Tweeter_Airline_Sentiment')
+		if not os.path.isdir(path_dir):
+			os.mkdir(path_dir)
+			url_train = 'https://github.com/ym001/Dune/raw/master/datasets/Airline-Sentiment.zip'
+			if self.verbose:
+				print("Downloading and extracting Tweeter_Airline_Sentiment...")
+			download_and_extract(url_train, path_dir)
+		if self.train:
+			path_file=os.path.join(path_dir,'Airline-Sentiment.csv')
+			# Context manager closes the file; newline='' lets the csv module
+			# handle line endings inside quoted fields itself.
+			with open(path_file, "r", newline='') as fi:
+				reader = csv.DictReader(fi, delimiter = ',')
+				for row in reader:
+					self.documents_train.append(row['text'])
+					self.labels_train.append(row['airline_sentiment'])
 
 def download_and_extract(url, data_dir):
+		"""
+		download_and_extract file of dataset.
+		"""
 		data_file = "temp.zip"
 		if os.path.isfile(data_file):
 			os.remove(data_file)
@@ -930,23 +1047,24 @@ def download_and_extract(url, data_dir):
 		#clean
 		if os.path.isfile(data_file):
 			os.remove(data_file)
-"""
-del directorie and is content.
-"""
+
 def clear_folder(dir):
-    print('clear : '+dir)
-    if os.path.exists(dir):
-        for the_file in os.listdir(dir):
-            file_path = os.path.join(dir, the_file)
-            try:
-                if os.path.isfile(file_path):
-                    os.unlink(file_path)
-                else:
-                    clear_folder(file_path)
-                    os.rmdir(file_path)
-            except Exception as e:
-                print(e)
-        os.rmdir(dir)
+	"""
+	Delete a directory and all of its content.
+
+	Prints ``'clear : ' + dir`` first. If ``dir`` exists, each entry is
+	removed (files are unlinked, anything else is cleared recursively) and
+	finally ``dir`` itself is removed. An error raised while removing an
+	entry is printed and the loop continues with the next entry.
+
+	:param dir: path of the directory to delete.
+	:raises OSError: if the final ``os.rmdir(dir)`` fails, e.g. because an
+		entry could not be removed and the directory is not empty.
+	"""
+	print('clear : '+dir)
+	if os.path.exists(dir):
+		for the_file in os.listdir(dir):
+			file_path = os.path.join(dir, the_file)
+			try:
+				if os.path.isfile(file_path):
+					os.unlink(file_path)
+				else:
+					# The recursive call removes file_path itself, so no
+					# extra os.rmdir(file_path) is needed here.
+					clear_folder(file_path)
+			except Exception as e:
+				print(e)
+		os.rmdir(dir)
 
 def construct_sample(path_train,classes=None):
 	documents_train,labels_train = [],[]

diff --git a/docs/Dataset.rst b/docs/Dataset.rst
@@ -3,3 +3,8 @@ Dataset
 
 .. automodule:: Manteia.Dataset
     :members:
+
+See :cite:`nelson` for an introduction to stylish blah, blah...
+
+.. bibliography:: biblio.bib
+    :style: unsrt
diff --git a/docs/_build/doctrees/Dataset.doctree b/docs/_build/doctrees/Dataset.doctree
diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 912835c1596e52fe83e70155f1c21020
+config: dcf37aa49cdba32acf9f450a5060fee0
 tags: 645f666f9bcd5a90fca523b33c5a78b7