Skip to content

Commit

Permalink
e
Browse files Browse the repository at this point in the history
  • Loading branch information
ym001 committed May 17, 2020
1 parent 9bb15e4 commit 5295386
Show file tree
Hide file tree
Showing 17 changed files with 333 additions and 36 deletions.
6 changes: 6 additions & 0 deletions Exemples/exemple_Dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,10 @@
import exemple_Dataset5
import exemple_Dataset6
import exemple_Dataset7
import exemple_Dataset8
import exemple_Dataset9
import exemple_Dataset10
import exemple_Dataset11
import exemple_Dataset12
import exemple_Dataset13

12 changes: 12 additions & 0 deletions Exemples/exemple_Dataset12.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Example: load the Amazon Review Polarity dataset and preview it."""
from Manteia.Dataset import Dataset

# test=True and desc=True request the test split and the description
# in addition to the training split.
ds = Dataset('Amazon Review Polarity', test=True, desc=True)

print('Train : ')
# Preview the first five entries of each loaded list, train split first.
for attribute in ('documents_train', 'labels_train',
                  'documents_test', 'labels_test'):
    print(getattr(ds, attribute)[:5])

print(ds.description)


16 changes: 16 additions & 0 deletions Exemples/exemple_Dataset13.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Example: load the Amazon Review Full dataset and preview it."""
from Manteia.Dataset import Dataset

# test=True and desc=True request the test split and the description
# in addition to the training split.
ds = Dataset('Amazon Review Full', test=True, desc=True)

# Each section prints its heading, then the first five documents and labels.
for heading, attributes in (
    ('Train : ', ('documents_train', 'labels_train')),
    ('Test : ', ('documents_test', 'labels_test')),
):
    print(heading)
    for attribute in attributes:
        print(getattr(ds, attribute)[:5])

print('Description :')
print(ds.description)


6 changes: 6 additions & 0 deletions Exemples/exemple_Dataset14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Example: load the Short_Jokes dataset and preview its documents."""
from Manteia.Dataset import Dataset

ds = Dataset('Short_Jokes')

print('Train : ')
# Only documents are previewed here; the example prints no labels.
first_documents = ds.documents_train[:5]
print(first_documents)
7 changes: 7 additions & 0 deletions Exemples/exemple_Dataset15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Example: load the Tweeter Airline Sentiment dataset and preview it."""
from Manteia.Dataset import Dataset

ds = Dataset('Tweeter Airline Sentiment')

print('Train : ')
# Preview the first five tweets, then the first five sentiment labels.
for attribute in ('documents_train', 'labels_train'):
    print(getattr(ds, attribute)[:5])
172 changes: 145 additions & 27 deletions Manteia/Dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,17 @@
class Dataset:

r"""
This is the class to give datasets.
This is the class description in order to get some dataset.
* **name** - name of the dataset (str)
* **train** - load the dataset train Default: ‘True’.
* **test** - load the dataset test Default: ‘False’.
* **dev** - load the dataset dev Default: ‘False’.
* **description** - load description Default: ‘False’.
* **url** -
* **verbose** -
* **verbose** - produce and display some explanation
* **path** - Path to the data file.
.. code-block:: python
print('hello')
"""
def __init__(self,name='20newsgroups',train=True,test=False,dev=False,classe=False,desc=False,path='./dataset',verbose=True):
r"""
Expand Down Expand Up @@ -102,6 +96,13 @@ def load(self):
self.load_Yelp_Review_Full()
if self.name=="Yelp Review Polarity":
self.load_Yelp_Review_Polarity()

if self.name=="Short_Jokes":
self.load_Short_Jokes()

if self.name=="Tweeter Airline Sentiment":
self.load_Tweeter_Airline_Sentiment()


def load_20newsgroups(self):
r"""
Expand Down Expand Up @@ -349,7 +350,27 @@ def load_Sogou_News(self):


def load_Amazon_Review_Polarity(self):

"""
Defines Amazon Review Polarity datasets.
The labels includes:
* 1 : Negative polarity.
* 2 : Positive polarity.
.. code-block:: python
from Manteia.Dataset import Dataset
ds=Dataset('Amazon Review Polarity',test=True,desc=True)
print('Train : ')
print(ds.documents_train[:5])
print(ds.labels_train[:5])
print(ds.documents_test[:5])
print(ds.labels_test[:5])
print(ds.description)
"""
self.path_dir = os.path.join(self.path,'amazon_review_polarity')
#!!!!!!!!!!!!!!!!!!!!
#self.del_dir(self.path_dir)
Expand Down Expand Up @@ -379,10 +400,32 @@ def load_Amazon_Review_Polarity(self):
self.description+=row

def load_Amazon_Review_Full(self):

r"""
Defines Amazon Review Full Star Dataset.
The labels includes:
**1 - 5** : rating classes (5 is highly recommended).
.. code-block:: python
from Manteia.Dataset import Dataset
ds=Dataset('Amazon Review Full',test=True,desc=True)
print('Train : ')
print(ds.documents_train[:5])
print(ds.labels_train[:5])
print('Test : ')
print(ds.documents_test[:5])
print(ds.labels_test[:5])
print('Description :')
print(ds.description)
"""
self.path_dir = os.path.join(self.path,'amazon_review_full')
#!!!!!!!!!!!!!!!!!!!!
#self.del_dir(self.path_dir)
self.del_dir(self.path_dir)
#!!!!!!!!!!!!!!!!!!!!

if not os.path.isdir(self.path_dir):
Expand Down Expand Up @@ -914,9 +957,83 @@ def load_pubmed_rct20k(self):
self.documents_dev.append(row[1])
self.labels_dev.append(row[0])

def load_Short_Jokes(self):
    r"""
    Defines Short_Jokes dataset.

    Fills ``self.documents_train`` with the second column of
    ``shortjokes.csv``. This loader does not set any labels.

    The archive is downloaded into ``<self.path>/Short_Jokes`` the first
    time only; an existing directory is taken to mean the data is already
    there.

    .. code-block:: python

        from Manteia.Dataset import Dataset

        ds=Dataset('Short_Jokes')

        print('Train : ')
        print(ds.documents_train[:5])
    """
    self.documents_train = []

    path_dir=os.path.join(self.path,'Short_Jokes')
    if not os.path.isdir(path_dir):
        # makedirs (unlike mkdir) also creates self.path when it is missing.
        os.makedirs(path_dir)
        url_train = 'https://github.com/ym001/Dune/raw/master/datasets/short-jokes.zip'
        if self.verbose:
            print("Downloading and extracting Short_Jokes...")
        download_and_extract(url_train, path_dir)
    if self.train:
        path_file=os.path.join(path_dir,'shortjokes.csv')
        # The with-block closes the file even if parsing fails.
        # newline='' is what the csv module asks for, so that quoted fields
        # containing line breaks are parsed correctly.
        with open(path_file, "r", newline='') as fi:
            # A real CSV parser is used instead of str.split(','): a joke
            # that contains a comma inside a quoted field stays one field
            # rather than being split and then dropped by the length check.
            for row in csv.reader(fi):
                # Keep only well-formed two-column rows.
                # NOTE(review): a header row, if the file has one, also has
                # two columns and is kept, as before. Confirm against the
                # actual file.
                if len(row)==2:
                    self.documents_train.append(row[1].strip())


def load_Tweeter_Airline_Sentiment(self):
    r"""
    Defines Tweeter Airline Sentiment dataset.

    Fills ``self.documents_train`` from the ``text`` column and
    ``self.labels_train`` from the ``airline_sentiment`` column of
    ``Airline-Sentiment.csv``.

    The archive is downloaded into ``<self.path>/Tweeter_Airline_Sentiment``
    the first time only; an existing directory is taken to mean the data is
    already there.

    The labels includes:

    * positive.
    * neutral.
    * negative.

    .. code-block:: python

        from Manteia.Dataset import Dataset

        ds=Dataset('Tweeter Airline Sentiment')

        print('Train : ')
        print(ds.documents_train[:5])
        print(ds.labels_train[:5])
    """
    self.documents_train = []
    self.labels_train = []

    path_dir=os.path.join(self.path,'Tweeter_Airline_Sentiment')
    if not os.path.isdir(path_dir):
        # makedirs (unlike mkdir) also creates self.path when it is missing.
        os.makedirs(path_dir)
        url_train = 'https://github.com/ym001/Dune/raw/master/datasets/Airline-Sentiment.zip'
        if self.verbose:
            print("Downloading and extracting Tweeter_Airline_Sentiment...")
        download_and_extract(url_train, path_dir)
    if self.train:
        path_file=os.path.join(path_dir,'Airline-Sentiment.csv')
        # The with-block closes the file even if a row lacks an expected
        # column and a KeyError is raised.
        # newline='' is what the csv module asks for, so that quoted fields
        # containing line breaks are parsed correctly.
        with open(path_file, "r", newline='') as fi:
            reader = csv.DictReader(fi, delimiter = ',')
            for row in reader:
                self.documents_train.append(row['text'])
                self.labels_train.append(row['airline_sentiment'])

def download_and_extract(url, data_dir):
"""
download_and_extract file of dataset.
"""
data_file = "temp.zip"
if os.path.isfile(data_file):
os.remove(data_file)
Expand All @@ -930,23 +1047,24 @@ def download_and_extract(url, data_dir):
#clean
if os.path.isfile(data_file):
os.remove(data_file)
"""
del directorie and is content.
"""

def clear_folder(dir):
    """
    Delete a directory and its content.

    Prints ``'clear : ' + dir`` for the directory and for every
    sub-directory it descends into. Does nothing more when ``dir`` does not
    exist.

    Failures on individual entries are printed and skipped (best effort).
    If an entry could not be removed, the final ``os.rmdir`` on ``dir``
    raises ``OSError`` because the directory is not empty.

    * **dir** - path of the directory to remove (str)
    """
    print('clear : '+dir)
    if not os.path.exists(dir):
        return
    for the_file in os.listdir(dir):
        file_path = os.path.join(dir, the_file)
        try:
            # Recurse into real directories only. A symlink to a directory
            # is unlinked like a file, so the link target is never emptied.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                # The recursive call removes file_path itself, so no second
                # rmdir is needed here.
                clear_folder(file_path)
            else:
                os.unlink(file_path)
        except OSError as e:
            # Best effort: report the entry that failed and carry on.
            print(e)
    os.rmdir(dir)

def construct_sample(path_train,classes=None):
documents_train,labels_train = [],[]
Expand Down
5 changes: 5 additions & 0 deletions docs/Dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ Dataset

.. automodule:: Manteia.Dataset
:members:

See :cite:`nelson` for an introduction to stylish blah, blah...

.. bibliography:: biblio.bib
:style: unsrt
Binary file modified docs/_build/doctrees/Dataset.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/_build/html/.buildinfo
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 912835c1596e52fe83e70155f1c21020
config: dcf37aa49cdba32acf9f450a5060fee0
tags: 645f666f9bcd5a90fca523b33c5a78b7

0 comments on commit 5295386

Please sign in to comment.